--- title: GTEx v8 - Download and Basic library preparation keywords: fastai sidebar: home_sidebar nb_path: "01_GTEx.ipynb" ---
{% raw %}
{% endraw %} {% raw %}
# logger.info('logger info')
# logger.warning('logger.warning')

import os, sys, glob
from logzero import logger
import inspect, urllib

import pandas
import scipy
import numpy as np
from pprint import pprint

import time, datetime

import seaborn as sns

import plotly.express as px

# from chart_studio import plotly as py
import plotly.figure_factory as ff
import plotly.graph_objects as go

# `__file__` is not defined inside a Jupyter notebook, so build a stand-in
# from the source location that `inspect` reports for a lambda defined in
# this cell. The value is only meaningful for this particular cell.
import inspect
__file__ = os.path.abspath(inspect.getfile(lambda: None))
import pathlib
# NOTE: script_dir is the resolved current working directory (Path() is "."),
# it is not derived from the `__file__` stand-in above.
script_dir = pathlib.Path().resolve()
{% endraw %} {% raw %}
# Gene transcripts per million (TPM) data and the annotation files that
# describe the subjects and samples.
#
# URLs are assembled with an explicit "/" separator. os.path.join uses the
# host operating system's path separator, so it would put backslashes into
# the URL on Windows.
GTEX_URL = "https://storage.googleapis.com/gtex_analysis_v8"

# Expression matrices: per-sample TPM and per-tissue median TPM.
GTEXV8_TPM = f"{GTEX_URL}/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz"
GTEXV8_TPM_MED = f"{GTEX_URL}/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"

# Annotation tables (DS) and their data dictionaries (DD).
GTEX_PHENO_DS = f"{GTEX_URL}/annotations/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"
GTEX_PHENO_DD = f"{GTEX_URL}/annotations/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDD.xlsx"
GTEX_SAMPLE_DS = f"{GTEX_URL}/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
GTEX_SAMPLE_DD = f"{GTEX_URL}/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDD.xlsx"

from tqdm import tqdm
import requests

# Local cache directory for the downloaded GTEx files.
cache_dir = os.path.join(script_dir, "data")

if os.path.exists(cache_dir):
    logger.info(f"Found {cache_dir}")
else:
    os.makedirs(cache_dir, exist_ok=True)

# Size of each streamed piece. Without an explicit chunk_size, iter_content()
# yields one byte at a time, which is extremely slow for multi-gigabyte files.
_DOWNLOAD_CHUNK_BYTES = 1 << 20

# Fetch every file that is not already cached.
for url in [GTEXV8_TPM_MED, GTEXV8_TPM, GTEX_PHENO_DS, GTEX_PHENO_DD, GTEX_SAMPLE_DS, GTEX_SAMPLE_DD]:
    dest = os.path.join(cache_dir, os.path.basename(url))
    if os.path.exists(dest):
        logger.info(f"found existing: {dest}")
        continue

    logger.info(f"Downloading {dest}")
    # Stream into a temporary name and rename only on success, so that an
    # interrupted download is never mistaken for a complete cached file.
    partial_dest = dest + ".part"
    with requests.get(url, stream=True) as response:
        # Fail loudly on HTTP errors instead of caching the error page.
        response.raise_for_status()
        with open(partial_dest, "wb") as fh:
            # tqdm counts chunks here, not bytes.
            for data in tqdm(response.iter_content(chunk_size=_DOWNLOAD_CHUNK_BYTES)):
                fh.write(data)
    os.replace(partial_dest, dest)
    logger.info(f"Completed {dest}")
[I 210115 18:47:40 <ipython-input-2-419abb4dad30>:19] Found /home/dustin/fleet_gene/data
[I 210115 18:47:40 <ipython-input-2-419abb4dad30>:26] found existing: /home/dustin/fleet_gene/data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz
[I 210115 18:47:40 <ipython-input-2-419abb4dad30>:26] found existing: /home/dustin/fleet_gene/data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
[I 210115 18:47:40 <ipython-input-2-419abb4dad30>:26] found existing: /home/dustin/fleet_gene/data/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt
[I 210115 18:47:40 <ipython-input-2-419abb4dad30>:26] found existing: /home/dustin/fleet_gene/data/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDD.xlsx
[I 210115 18:47:40 <ipython-input-2-419abb4dad30>:26] found existing: /home/dustin/fleet_gene/data/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
[I 210115 18:47:40 <ipython-input-2-419abb4dad30>:26] found existing: /home/dustin/fleet_gene/data/GTEx_Analysis_v8_Annotations_SampleAttributesDD.xlsx
{% endraw %} {% raw %}
# Source URLs, restated so this cell can derive the cached file names.
# They carry the same bucket sub-directories ("rna_seq_data", "annotations")
# as the URLs used for downloading; rebinding these names without the
# sub-directory would leave them pointing at locations that do not exist.
GTEXV8_TPM = f"{GTEX_URL}/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz"
GTEXV8_TPM_MED = f"{GTEX_URL}/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz"
GTEX_PHENO = f"{GTEX_URL}/annotations/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"
GTEX_SAMPLE_DS = f"{GTEX_URL}/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
GTEX_SAMPLE_DD = f"{GTEX_URL}/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDD.xlsx"

# Detailed data file:
gtex_tpm_fn = os.path.join(cache_dir, os.path.basename(GTEXV8_TPM))

# Just looking at median values by tissue (SMTSD) greatly reduces file size
gtex_tpm_med_fn = os.path.join(cache_dir, os.path.basename(GTEXV8_TPM_MED))


# Sample annotation data

# Subject Phenotype data
# Age, sex and Hardy Scale death circumstances
gtex_pheno_fn = os.path.join(cache_dir, os.path.basename(GTEX_PHENO))

# Main sample data of interest is:
# 'SMTS': 'Tissue Type, area from which the tissue sample was taken.  This is a parent value to SMTSD.'
# 'SMTSD': 'SMTS Detailed'
gtex_attr_fn = os.path.join(cache_dir, os.path.basename(GTEX_SAMPLE_DS))
gtex_attr_desc_fn = os.path.join(cache_dir, os.path.basename(GTEX_SAMPLE_DD))

# Load tissue-specific details
gtex_attr = pandas.read_csv(gtex_attr_fn, sep='\t')
tissue_types = sorted(gtex_attr['SMTS'].unique())

# tissue_dict maps each tissue type (SMTS) to:
#   'samples'  - every sample id (SAMPID) of that tissue
#   'subtypes' - {detailed tissue (SMTSD): sample ids of that subtype}
tissue_dict = dict()
for tissue in tissue_types:
    # Filter the attribute table once per tissue and reuse the result.
    tissue_rows = gtex_attr[gtex_attr['SMTS'] == tissue]
    tissue_dict[tissue] = {
        'samples': tissue_rows['SAMPID'].to_list(),
        # SMTS is the parent value of SMTSD, so the subtype samples are
        # looked up within the parent tissue's rows.
        'subtypes': {
            subtype: tissue_rows.loc[tissue_rows['SMTSD'] == subtype, 'SAMPID'].to_list()
            for subtype in tissue_rows['SMTSD'].unique()
        },
    }

tissue_subtypes = sorted(gtex_attr['SMTSD'].unique())

# log number of tissue samples
logger.info('%s separate tissue types - total samples %s', len(tissue_dict), len(gtex_attr['SAMPID']))
for tissue in sorted(tissue_dict):
    logger.info('%s (n=%s)', tissue, len(tissue_dict[tissue]['samples']))
    # A single subtype would only repeat the tissue line, so list subtypes
    # only when there are several.
    if len(tissue_dict[tissue]['subtypes']) > 1:
        for subtype in sorted(tissue_dict[tissue]['subtypes']):
            logger.info('\t%s (n=%s)', subtype, len(tissue_dict[tissue]['subtypes'][subtype]))
# logger.info(pprint.pformat(tissue_dict))

# Other annotation data

# Mutually exclusive 
# Which study was this from?
# mut_info_fn = os.path.join(cache_dir, 'enrichments-analysis-result.txt')
#mut_info = pandas.read_csv(mut_info_fn, sep='\t')

# Human housekeeping gene - stable expression

## https://www.cell.com/trends/genetics/pdf/S0168-9525(13)00089-9.pdf - Human housekeeping genes, revisited

# https://m.tau.ac.il/~elieis/HKG/HK_genes.txt
# E. Eisenberg and E.Y. Levanon, Trends in Genetics, 29 (2013)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:43] 31 separate tissue types - total samples 22951
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Adipose Tissue (n=1327)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Adipose - Subcutaneous (n=763)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Adipose - Visceral (Omentum) (n=564)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Adrenal Gland (n=275)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Bladder (n=21)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Blood (n=3480)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Cells - EBV-transformed lymphocytes (n=192)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Whole Blood (n=3288)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Blood Vessel (n=1473)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Artery - Aorta (n=450)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Artery - Coronary (n=253)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Artery - Tibial (n=770)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Bone Marrow (n=217)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Brain (n=3326)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Brain - Amygdala (n=177)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Brain - Anterior cingulate cortex (BA24) (n=213)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Brain - Caudate (basal ganglia) (n=291)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Brain - Cerebellar Hemisphere (n=263)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Brain - Cerebellum (n=298)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Brain - Cortex (n=325)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Brain - Frontal Cortex (BA9) (n=425)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Brain - Hippocampus (n=243)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Brain - Hypothalamus (n=236)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Brain - Nucleus accumbens (basal ganglia) (n=277)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Brain - Putamen (basal ganglia) (n=232)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Brain - Spinal cord (cervical c-1) (n=182)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Brain - Substantia nigra (n=164)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Breast (n=480)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Cervix Uteri (n=19)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Cervix - Ectocervix (n=9)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Cervix - Endocervix (n=10)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Colon (n=821)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Colon - Sigmoid (n=389)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Colon - Transverse (n=432)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Esophagus (n=1582)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Esophagus - Gastroesophageal Junction (n=401)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Esophagus - Mucosa (n=622)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Esophagus - Muscularis (n=559)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Fallopian Tube (n=9)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Heart (n=1141)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Heart - Atrial Appendage (n=452)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Heart - Left Ventricle (n=689)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Kidney (n=104)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Kidney - Cortex (n=100)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Kidney - Medulla (n=4)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Liver (n=251)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Lung (n=867)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Muscle (n=1132)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Nerve (n=722)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Ovary (n=195)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Pancreas (n=360)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Pituitary (n=301)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Prostate (n=262)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Salivary Gland (n=181)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Skin (n=2014)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Cells - Cultured fibroblasts (n=527)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Skin - Not Sun Exposed (Suprapubic) (n=638)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:48] 	Skin - Sun Exposed (Lower leg) (n=849)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Small Intestine (n=193)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Spleen (n=260)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Stomach (n=381)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Testis (n=406)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Thyroid (n=812)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Uterus (n=166)
[I 210115 18:47:41 <ipython-input-3-2687d237638c>:45] Vagina (n=173)
{% endraw %} {% raw %}
# The complete set of GTEx TPM values is very large and hard to work with,
# so several functions load only part of the data at a time.

# Highly mutated genes according to www.tumourportal.org
TUMOUR_MUT_GENES = [
    'TP53', 'PIK3CA', 'PTEN', 'KRAS', 'APC',
    'MLL3',  # aka KMT2C
    'KMT2C', 'FAT1',
    'MLL2',  # aka KMT2D
    'KMT2D', 'ARID1A', 'VHL', 'PBRM1', 'NF1', 'EGFR', 'ATM', 'PIK3R1',
    'BRAF', 'CDKN2A', 'SETD2', 'CREBBP', 'FBXW7', 'SPEN', 'MTOR', 'RB1',
    'SMARCA4', 'NOTCH1',
]

# Genes of interest - a handful of test genes
TEST_GENES = [
    'TP53',
    'ERBB2',  # Herceptin target
    'EGFR', 'AKT1', 'KRAS', 'PTEN', 'APOE',
]

# Final working list: both lists merged, duplicates removed, alphabetical.
TEST_GENES = sorted({*TEST_GENES, *TUMOUR_MUT_GENES})

# gtex_tpm file has genes as rows.
# First two columns are gene information
# Over 10000 columns of samples

#                Name Description  GTEX-1117F-0226-SM-5GZZ7  GTEX-111CU-1826-SM-5GZYN ...
#0  ENSG00000223972.4     DDX11L1                   0.10820                   0.11580
#1  ENSG00000227232.4      WASH7P                  21.40000                  11.03000


GTPM_SKIP = 2  # skip two comment rows at the beginning of the file

# Two cheap "index" views of this giant file:
#   gtpm_row_table    - every row, but only the first four columns; used to
#                       find the row of a gene by its Description.
#   gtpm_column_table - the header plus a single data row; used to find out
#                       which sample columns exist.
logger.info('Loading Row partial table ...')
gtpm_row_table = pandas.read_csv(gtex_tpm_fn, skiprows=GTPM_SKIP, sep='\t', usecols=range(4))
logger.info('Loading column partial table ...')
# nrows=1 lets the parser stop after the first data row. A callable skiprows
# is evaluated for every line and forces the whole file to be scanned.
gtpm_column_table = pandas.read_csv(gtex_tpm_fn, skiprows=GTPM_SKIP, sep='\t', nrows=1)

def gtpm_row_by_genename(gene_name):
    """Return the line number of ``gene_name`` in the GTEx TPM file.

    The gene is looked up by exact match in the ``Description`` column of
    ``gtpm_row_table``. When several rows match, the first one is used.

    The result counts lines from the top of the file, starting at 0:
    ``GTPM_SKIP`` leading rows, then one header row, then the table index of
    the matching row. It is suitable for a ``skiprows`` filter.

    Returns ``None``, after logging an error, when no row matches.
    """
    sub_table = gtpm_row_table[gtpm_row_table['Description'] == gene_name]
    if sub_table.empty:
        logger.error("Missing '%s' row in GTEX TPM table", gene_name)
        return None
    # +1 accounts for the header line that follows the skipped rows.
    return GTPM_SKIP + 1 + sub_table.index[0]

def gtpm_keep_rows(gene_list=TEST_GENES):
    """List the file line numbers needed to load the given genes.

    The list starts with ``GTPM_SKIP`` (the header line) and continues with
    the line number of each gene, in ``gene_list`` order. Genes for which
    ``gtpm_row_by_genename`` finds no row are left out.
    """
    looked_up = list(map(gtpm_row_by_genename, gene_list))
    rows = [GTPM_SKIP]
    for row in looked_up:
        if row:
            rows.append(row)
    return rows


def gtex_tpm_partial_table_load(gene_list, sample_list=None):
    """Load part of the GTEx TPM table, limited by gene and/or sample.

    Args:
        gene_list: gene names (``Description`` values) whose rows should be
            loaded. A falsy value loads every row.
        sample_list: sample ids whose columns should be loaded. Ids that are
            not columns of the TPM file are dropped. A falsy value loads
            every column.

    Returns:
        A DataFrame with the selected rows. When ``sample_list`` is given it
        contains the ``Name`` and ``Description`` columns plus the requested
        sample columns that exist in the file.
    """
    start_time = time.time()
    if gene_list:
        keep_rows = gtpm_keep_rows(gene_list)
        # The filter runs once per line of the file; a set keeps it O(1).
        keep_row_set = set(keep_rows)
        skiprows_func = lambda x: x not in keep_row_set
        logger.info('loading %s GTEX TPM rows', len(keep_rows))
    else:
        logger.info('loading all rows')
        skiprows_func = lambda x: x in (0, 1)

    if sample_list:
        logger.info('Limiting load to %s potential samples', len(sample_list))
        # Not all samples are in tables
        known_columns = set(gtpm_column_table.columns)
        sample_list = [s for s in sample_list if s in known_columns]
        logger.info('Limiting load to %s samples', len(sample_list))
        # Keep the gene identifier columns next to the sample columns;
        # without them the rows could not be told apart.
        columns = ['Name', 'Description'] + sample_list
    else:
        columns = None

    gtpm_partial = pandas.read_csv(
        gtex_tpm_fn,
        skiprows=skiprows_func,  # Only selected gene rows
        sep='\t',
        header=0,
        usecols=columns,  # Gene identifier columns plus selected samples
    )
    end_time = time.time()
    logger.info("Gtex TPM partial table took {} minutes".format((end_time - start_time)/60))
    return gtpm_partial
[I 210115 18:47:41 <ipython-input-4-b213538e8d2c>:61] Loading Row partial table ...
[I 210115 18:48:17 <ipython-input-4-b213538e8d2c>:63] Loading column partial table ...
{% endraw %} {% raw %}
# Load the per-tissue median TPM table; the first two lines of the file are
# skipped so that the third line becomes the header.
logger.info(f"loading median GTEx tissue TPM values: {gtex_tpm_med_fn}")
gtpm_med = pandas.read_csv(gtex_tpm_med_fn, sep='\t', skiprows=2, low_memory=False)
# gtpm_breast = gtex_tpm_partial_table_load(gene_list=TEST_GENES, sample_list=tissue_dict['Breast']['samples'])

# Load the per-sample TPM rows of the test genes only (all sample columns).
logger.info(f"loading GTEX TPMs for {len(TEST_GENES)} genes")
logger.info(f" GTEX GENES {TEST_GENES}")
gtpm = gtex_tpm_partial_table_load(gene_list=TEST_GENES,)

# Remove missing genes from the test
# TEST_GENES is rebound to the Description values of the rows that were
# actually loaded, in the order they appear in the loaded table.
TEST_GENES = gtpm['Description'].to_list()
[I 210115 18:48:48 <ipython-input-5-cd9953dd7a82>:2] loading median GTEx tissue TPM values: /home/dustin/fleet_gene/data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz
[I 210115 18:48:48 <ipython-input-5-cd9953dd7a82>:6] loading GTEX TPMs for 30 genes
[I 210115 18:48:48 <ipython-input-5-cd9953dd7a82>:7]  GTEX GENES ['AKT1', 'APC', 'APOE', 'ARID1A', 'ATM', 'BRAF', 'CDKN2A', 'CREBBP', 'EGFR', 'ERBB2', 'FAT1', 'FBXW7', 'KMT2C', 'KMT2D', 'KRAS', 'MLL2', 'MLL3', 'MTOR', 'NF1', 'NOTCH1', 'PBRM1', 'PIK3CA', 'PIK3R1', 'PTEN', 'RB1', 'SETD2', 'SMARCA4', 'SPEN', 'TP53', 'VHL']
[E 210115 18:48:48 <ipython-input-4-b213538e8d2c>:69] Missing 'MLL2 row in GTEX TPM table'
[E 210115 18:48:48 <ipython-input-4-b213538e8d2c>:69] Missing 'MLL3 row in GTEX TPM table'
[I 210115 18:48:48 <ipython-input-4-b213538e8d2c>:83] loading 29 GTEX TPM rows
[I 210115 18:49:19 <ipython-input-4-b213538e8d2c>:105] Gtex TPM partial table took 0.5178065180778504 minutes
{% endraw %}

Median TPM values and Tissue comparison

The smaller table of median TPM values can give us a good idea of the overall character of the different genes.

The spread includes extreme values, but most of the tissues are similar.

Mean is ~ 16 - 17 TPM, but 50th percentile is 0 and 75th percentile is ~ 2.

This indicates that TPM values are not normally distributed: the distribution is skewed by a small number of extreme values.

{% raw %}
# Reshape the median TPM table into one column per tissue type.

# Pick a single representative subtype for every tissue (the alphabetically
# last one) and map its column name to the parent tissue name. A tissue whose
# representative is the leukemia cell line is left out.
rename_cols = dict()
for t in tissue_dict:
    # pprint(tissue_dict[t]['subtypes'].keys())
    subtype = sorted(tissue_dict[t]['subtypes'])[-1]
    if subtype != 'Cells - Leukemia cell line (CML)':
        rename_cols[subtype] = t
sample_columns = tuple(tissue_dict)

# gm = gtpm_med.drop('gene_id', axis=1).set_index('Description')
gm = gtpm_med.set_index('Description')

gm.rename(columns=rename_cols, inplace=True)
# Keep only the renamed (representative) columns, ordered by tissue name.
gm = gm[sorted(rename_cols.values())]
# Copy table so we can add and modify values
gm = gm.copy()

# Using the gm table we can characterize the genes a bit.
# the spread includes extreme values, most of the tissues are similar
# Mean is ~ 16 - 17 TPM, but 50th percentile is 0 and 75th percentile is ~ 2
# Use the full option name: the abbreviated 'max_columns' is a pattern that
# can match more than one pandas option, which raises an error.
pandas.set_option('display.max_columns', 60)
pandas.set_option('display.width', 120)
#print(gm.describe())
print(gm.loc[TEST_GENES].head())
             Adipose Tissue  Adrenal Gland  Bladder     Blood  Blood Vessel    Brain   Breast  Cervix Uteri    Colon  \
Description                                                                                                            
MTOR                12.5037        12.8676  15.9141   3.03492       17.6186  7.33520  14.5158       15.8568  13.3463   
SPEN                22.1260        13.2763  27.9158   7.53548       51.9570  9.19060  29.1025       32.0656  17.8085   
ARID1A              21.5751        21.1199  32.2639  11.15880       31.4493  8.96392  25.8875       31.9771  22.2391   
VHL                 14.1879        15.5978  15.0551  13.23610       11.6079  7.35583  16.7531       22.6914  13.1580   
SETD2               21.3039        16.6784  28.6666   9.20532       33.9075  8.32916  25.3699       35.7986  20.8747   

             Esophagus  Fallopian Tube    Heart   Kidney     Liver     Lung    Muscle    Nerve    Ovary  Pancreas  \
Description                                                                                                         
MTOR           18.9537         16.2413  7.55295  11.1177   7.47069  15.2756  15.04980  19.7788  14.1306   5.90858   
SPEN           25.3144         38.2886  6.73727  15.0826   8.25575  32.8227  16.85600  43.1718  45.3009   9.81830   
ARID1A         23.2820         33.7390  9.33664  14.9593  10.04250  31.0712  13.78600  39.7431  36.4904  10.18270   
VHL            11.5110         18.0739  3.84112  10.7547   7.38084  20.3767   6.29122  17.9951  20.2502   7.16831   
SETD2          25.8276         31.1203  6.62032  14.8113   9.18743  28.1247  17.83440  32.8212  36.9143   9.46797   

             Pituitary  Prostate  Salivary Gland     Skin  Small Intestine   Spleen  Stomach   Testis  Thyroid  \
Description                                                                                                      
MTOR           17.6921   16.5626         12.8679  13.2190          15.2865  15.0441  11.4312  35.5128  19.0889   
SPEN           22.8996   23.6944         22.2075  31.8278          23.2464  21.8492  14.0972  22.6114  37.8968   
ARID1A         24.0102   25.9706         26.1924  30.1194          26.9634  37.3643  21.8782  34.5622  42.7936   
VHL            15.0265   17.3962         17.0114  16.8374          18.6890  24.0985  11.1474  15.1527  20.5107   
SETD2          22.9113   25.8065         21.3374  28.2580          24.6022  23.1161  16.3570  34.8691  35.9564   

              Uterus   Vagina  
Description                    
MTOR         20.0622  13.7552  
SPEN         40.1904  30.5660  
ARID1A       38.9495  31.1372  
VHL          20.8106  18.9288  
SETD2        38.8762  28.8251  
{% endraw %} {% raw %}
# Per-gene summary statistics across tissues.
#
# Every statistic is computed from the tissue columns only. Computing them
# row-wise on `gm` itself after a statistic column has been added would feed
# that column back into the next statistic (the median would include the
# mean, the std would include mean and median, and so on). Excluding the
# statistic columns by name also makes this cell safe to re-run.
stat_columns = ('mean', 'median', 'std', 'CoV', 'sem')
tissue_values = gm[[col for col in gm.columns if col not in stat_columns]]

gm['mean'] = tissue_values.mean(axis=1)
gm['median'] = tissue_values.median(axis=1)
gm['std'] = tissue_values.std(axis=1)
# Coefficient of variation; defined as infinite when the mean is zero.
gm['CoV'] = np.where(gm['mean'] == 0, np.inf, gm['std'] / gm['mean'])
gm['sem'] = tissue_values.sem(axis=1)

# high expression # low deviation among tissues
gm_bench = gm[(gm['median'] >= 1)  # high expression
              & (gm['median'] - 3.5 * gm['std'] > 0)  # median is more than 3.5 std above zero
              ]

#cross_tissue_genes = sorted(gm_bench['Description'].tolist())

print("Of {} genes, the top {} in stable cross-tissue expression are:".format(len(gm), len(gm_bench)))
#for gene in cross_tissue_genes:
#    print('\t{}'.format(gene))
gm_bench.sort_values(by='CoV')
Of 56200 genes, the top 377 in stable cross-tissue expression are:
Adipose Tissue Adrenal Gland Bladder Blood Blood Vessel Brain Breast Cervix Uteri Colon Esophagus Fallopian Tube Heart Kidney Liver Lung Muscle Nerve Ovary Pancreas Pituitary Prostate Salivary Gland Skin Small Intestine Spleen Stomach Testis Thyroid Uterus Vagina mean median std CoV sem
Description
AP3S2 14.0070 17.69710 16.20940 12.40220 15.66370 13.38010 15.96270 16.46960 15.65950 16.02500 14.34720 11.33360 12.74180 10.27910 14.62880 18.67900 17.11230 16.23260 8.48215 16.24050 16.70710 11.93140 14.49820 11.34020 12.10220 10.43030 15.95040 17.64000 16.26500 13.00090 14.447302 14.628800 2.466756 0.170742 0.678146
SURF1 42.7694 52.09010 59.76620 32.08450 64.98730 38.37360 47.46350 49.70920 45.72520 57.29880 52.28280 28.44280 42.27440 42.51620 47.51560 37.01600 57.39880 47.08930 35.93650 57.67430 57.90280 43.49840 44.70370 46.31640 53.18220 55.45690 54.00630 67.20690 50.01400 41.77260 48.415823 47.515600 8.949101 0.184838 2.338488
LAMTOR2 39.9044 46.17200 38.89370 24.38170 35.72260 25.59660 37.53480 38.59410 35.08200 37.79260 34.73450 22.43390 42.32350 42.43670 44.01850 23.91700 41.61220 22.20760 22.79470 32.64740 39.18190 34.87960 35.68140 33.03380 43.26920 37.14770 26.70690 39.25180 33.62330 35.06650 34.888087 35.681400 6.743300 0.193284 1.712980
POLR2J 41.1272 54.06690 44.71260 35.81550 47.65950 40.29050 42.46560 51.49120 39.61180 46.40860 48.07120 36.48010 38.51530 31.26050 42.47600 63.05530 47.23780 57.23900 18.07600 60.06370 56.45340 37.83050 44.43740 38.48440 49.19190 37.32010 47.91630 57.69100 51.10510 45.61120 45.072187 45.072187 9.071866 0.201274 2.247662
CLPP 64.1126 77.43430 66.25410 34.33740 72.61000 34.82350 64.98880 68.50960 55.84680 69.26330 76.22850 40.74900 52.49170 50.44760 67.43170 74.36650 69.23090 65.13600 29.53380 69.29990 66.99440 51.24180 66.27830 58.56210 63.92240 56.36960 75.40720 66.66060 71.42990 61.52190 61.382807 65.136000 12.375733 0.201616 3.067861
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
FAM120A 49.1946 54.06830 68.98200 21.97120 57.44420 16.91650 47.50630 61.52950 58.28650 53.96880 61.73320 23.48150 29.93730 24.86210 58.66130 39.25620 48.18930 55.37450 25.72900 40.30790 50.07350 44.56680 39.58140 50.72620 48.87870 42.22360 21.01310 59.00910 63.18700 51.25730 45.597230 48.878700 13.962732 0.306219 2.822202
PRCC 46.6674 38.06680 56.71630 32.38660 58.68800 21.94350 52.10840 70.86710 41.59450 51.82400 61.19580 20.21270 38.78520 18.57160 53.78460 44.05480 60.99200 58.79670 15.67790 47.59190 56.98280 39.89930 62.63500 44.15650 54.53750 35.35290 55.80860 62.38860 72.75780 56.62980 47.722487 51.824000 14.627257 0.306507 2.956161
PEX26 4.9807 6.07572 6.13928 1.78087 6.07091 4.37476 5.96001 6.93495 7.11902 6.19398 6.45258 1.91616 4.89677 3.60670 6.41964 2.62936 6.54088 6.15766 2.54451 7.39641 6.73685 5.71402 6.66166 9.03487 6.55887 4.69171 3.67475 8.05696 7.53741 6.34875 5.640224 6.139280 1.741597 0.308782 0.347409
WDR55 18.8493 15.24400 22.54880 13.81490 18.19360 6.49282 23.59090 25.65210 20.63950 16.67840 23.72370 6.24822 16.25960 9.78975 26.16740 9.54263 24.61160 22.20200 10.38140 22.06600 22.71970 22.24960 25.77030 24.52200 29.51100 17.79570 14.11240 22.74260 23.86330 22.33450 19.277257 22.066000 6.005755 0.311546 1.204829
DCTN4 25.9578 31.52040 31.27010 10.49070 33.02930 24.13200 27.44270 34.15950 17.87020 25.41670 31.20850 10.80570 13.84910 9.76934 29.28830 28.19350 29.51390 41.25800 13.60460 33.52740 29.02490 20.02790 25.56590 18.10220 18.56860 19.70800 29.75860 28.58580 33.94350 27.63430 25.107581 27.442700 7.822858 0.311574 1.569661

377 rows × 35 columns

{% endraw %} {% raw %}
# Show the 30 genes with the smallest 'sem' value (rows ordered ascending).
gm.sort_values('sem', ascending=True).head(n=30)
Adipose Tissue Adrenal Gland Bladder Blood Blood Vessel Brain Breast Cervix Uteri Colon Esophagus Fallopian Tube Heart Kidney Liver Lung Muscle Nerve Ovary Pancreas Pituitary Prostate Salivary Gland Skin Small Intestine Spleen Stomach Testis Thyroid Uterus Vagina mean median std CoV sem
Description
HSPA8P1 0.148095 0.123131 0.101509 0.077584 0.124232 0.084888 0.095281 0.079885 0.103350 0.142390 0.087713 0.111385 0.181358 0.056784 0.175487 0.209438 0.129799 0.157366 0.128915 0.150621 0.082037 0.124029 0.064946 0.114999 0.129898 0.146765 0.199473 0.136738 0.115292 0.080004 0.122113 0.123131 0.037273 0.305234 0.008627
HSPA8P9 0.055016 0.046590 0.037705 0.025902 0.046430 0.032752 0.031669 0.000000 0.035711 0.054096 0.033078 0.034395 0.049115 0.019232 0.067903 0.069458 0.045968 0.055416 0.049676 0.065675 0.042739 0.037472 0.021811 0.039438 0.051874 0.052205 0.061373 0.054629 0.041686 0.031114 0.043004 0.043004 0.014850 0.345317 0.009289
EEF1B2P1 0.125402 0.101223 0.149451 0.039746 0.090734 0.051101 0.129224 0.195992 0.104318 0.096518 0.157038 0.028352 0.081832 0.043633 0.117102 0.107330 0.138389 0.189700 0.137283 0.114675 0.120355 0.093698 0.171082 0.136991 0.159127 0.079997 0.109012 0.092557 0.147776 0.158139 0.115593 0.115593 0.040933 0.354115 0.010063
RP11-543B16.2 0.158196 0.144616 0.183101 0.247400 0.160438 0.071693 0.157943 0.215711 0.177776 0.149606 0.077967 0.037474 0.189248 0.144738 0.183302 0.000000 0.170458 0.156512 0.158476 0.153295 0.155455 0.133929 0.211680 0.203912 0.202416 0.131455 0.146921 0.166900 0.148995 0.144303 0.152797 0.156512 0.049462 0.323711 0.010150
HSPA8P5 0.066848 0.089790 0.059954 0.036590 0.060331 0.050960 0.043527 0.052955 0.055253 0.069665 0.076508 0.051104 0.057314 0.033588 0.092814 0.102331 0.087680 0.072796 0.160371 0.083803 0.050103 0.050841 0.040348 0.060595 0.074068 0.072449 0.136668 0.073620 0.058382 0.043460 0.068824 0.060595 0.026933 0.391339 0.010600
TATDN1P1 0.132253 0.070601 0.077971 0.053560 0.140322 0.053817 0.163642 0.092057 0.087597 0.092915 0.193201 0.054994 0.139971 0.037159 0.144888 0.130620 0.171131 0.204788 0.079262 0.086326 0.134622 0.092439 0.122388 0.099247 0.089259 0.095142 0.091371 0.174909 0.162705 0.155767 0.114164 0.099247 0.042997 0.376628 0.010779
HNRNPKP1 0.164346 0.164056 0.164873 0.140656 0.201567 0.064785 0.173174 0.127135 0.139325 0.277672 0.093213 0.052150 0.117979 0.079996 0.200467 0.148183 0.216425 0.255880 0.071358 0.205526 0.155760 0.182666 0.214414 0.167831 0.181314 0.124820 0.238360 0.217577 0.201248 0.161986 0.163491 0.164346 0.054221 0.331647 0.010816
UBBP1 0.127087 0.183315 0.100414 0.262166 0.214407 0.134830 0.123228 0.093768 0.124363 0.144533 0.128086 0.093546 0.131833 0.138210 0.215061 0.170352 0.097268 0.126788 0.073517 0.199578 0.139729 0.114847 0.099132 0.130106 0.132152 0.151962 0.317259 0.204596 0.120936 0.135990 0.147635 0.132152 0.051662 0.349931 0.010873
HMGB1P8 0.157683 0.129000 0.172083 0.102700 0.196170 0.082661 0.164879 0.128029 0.120055 0.142930 0.227439 0.061415 0.125853 0.138483 0.208194 0.108271 0.193273 0.229627 0.061702 0.189921 0.225670 0.147384 0.101221 0.142336 0.202285 0.115324 0.200258 0.171839 0.314326 0.199431 0.158681 0.157683 0.054219 0.341682 0.010983
FAM45BP 0.111674 0.086921 0.048982 0.100251 0.082137 0.072242 0.108412 0.086065 0.083946 0.060847 0.074231 0.032365 0.069735 0.044107 0.121526 0.061888 0.162843 0.161808 0.025993 0.060771 0.104949 0.081558 0.138734 0.082744 0.084577 0.057290 0.065431 0.127675 0.102416 0.165707 0.088927 0.083946 0.035419 0.398294 0.010994
F8A3 0.043233 0.033447 0.066199 0.048827 0.067318 0.017369 0.055740 0.066727 0.043613 0.041678 0.038270 0.017221 0.007798 0.028621 0.055626 0.028754 0.067952 0.030278 0.018196 0.035788 0.041752 0.043102 0.029434 0.047969 0.051787 0.024227 0.022125 0.071258 0.050233 0.036990 0.041051 0.041678 0.016495 0.401829 0.011004
RP6-218J18.2 0.122398 0.099849 0.156031 0.088189 0.130925 0.042971 0.129008 0.143502 0.105336 0.111983 0.165927 0.038531 0.185980 0.064579 0.204029 0.057842 0.187265 0.210339 0.045657 0.163471 0.170595 0.111308 0.122707 0.133023 0.184607 0.101659 0.174072 0.221449 0.173075 0.132937 0.132641 0.132641 0.049388 0.372342 0.011142
EI24P2 0.084741 0.047093 0.105650 0.000000 0.105824 0.031119 0.079675 0.100854 0.079285 0.089354 0.133730 0.032436 0.000000 0.125960 0.083467 0.112686 0.074401 0.102864 0.057016 0.065261 0.069090 0.080236 0.121177 0.069154 0.056440 0.078108 0.087839 0.062519 0.104465 0.107599 0.078268 0.079675 0.032086 0.409946 0.011235
LYPLA2P1 0.067001 0.087040 0.039106 0.114603 0.068219 0.000000 0.103024 0.130422 0.110890 0.056668 0.075812 0.034421 0.084973 0.067389 0.156049 0.057885 0.106090 0.076897 0.035974 0.082373 0.118185 0.130795 0.137262 0.104327 0.134403 0.092327 0.145336 0.149179 0.069550 0.119377 0.091853 0.091853 0.037202 0.405020 0.011250
BCLAF1P2 0.080648 0.067342 0.068520 0.028160 0.111809 0.020136 0.089533 0.090660 0.056146 0.059145 0.096517 0.019585 0.047475 0.021224 0.087388 0.052297 0.099547 0.144015 0.030313 0.058621 0.062189 0.090543 0.080957 0.070798 0.061655 0.047777 0.061997 0.105634 0.108120 0.078219 0.069899 0.068520 0.028989 0.414732 0.011327
bP-21264C1.2 0.083331 0.106339 0.116904 0.018674 0.113708 0.066459 0.106726 0.119785 0.084311 0.100700 0.228053 0.060423 0.203898 0.049444 0.102063 0.069661 0.134099 0.163363 0.064802 0.159095 0.117522 0.091280 0.120944 0.095055 0.084537 0.095482 0.164100 0.184001 0.125072 0.122715 0.111752 0.106726 0.044235 0.395833 0.011353
AC008985.1 0.031527 0.033872 0.039921 0.017734 0.019498 0.011847 0.040592 0.054291 0.034069 0.036859 0.016067 0.007204 0.030808 0.022515 0.051608 0.021692 0.069993 0.049559 0.017105 0.044464 0.051277 0.039878 0.058685 0.046286 0.049681 0.031492 0.027968 0.051031 0.066035 0.055190 0.037625 0.037625 0.015861 0.421562 0.011632
SETP8 0.139041 0.170444 0.197523 0.000000 0.176669 0.094897 0.139809 0.189616 0.139962 0.155717 0.163631 0.066409 0.175130 0.033319 0.198860 0.183748 0.180280 0.246842 0.078209 0.151014 0.095990 0.218402 0.163321 0.169240 0.148156 0.133919 0.205709 0.183319 0.273475 0.185553 0.155273 0.163631 0.056685 0.365066 0.011675
RPSAP4 0.119289 0.088486 0.100658 0.076335 0.126333 0.054995 0.134461 0.170190 0.122655 0.114883 0.162042 0.037677 0.053415 0.063286 0.136104 0.070664 0.151237 0.292081 0.117590 0.106530 0.134705 0.097330 0.179975 0.123395 0.138825 0.108156 0.060727 0.148512 0.166217 0.150328 0.120236 0.120236 0.048253 0.401317 0.011757
SLX1B 0.120469 0.120866 0.164840 0.150058 0.084326 0.070363 0.134317 0.228225 0.152368 0.101207 0.195422 0.042685 0.102439 0.124323 0.216225 0.071079 0.153167 0.144603 0.054925 0.201529 0.184997 0.151511 0.108982 0.218990 0.280232 0.111934 0.117604 0.235386 0.168631 0.133754 0.144849 0.144603 0.055192 0.381032 0.011853
CTA-212A2.3 0.007064 0.002835 0.004089 0.002995 0.008111 0.002257 0.008151 0.009699 0.004799 0.005029 0.004410 0.003570 0.003760 0.006415 0.012113 0.005871 0.009283 0.007818 0.001923 0.006268 0.006456 0.004407 0.006260 0.004417 0.007229 0.003602 0.010172 0.007464 0.010010 0.007950 0.006148 0.006260 0.002515 0.409183 0.011865
HNRNPKP2 0.226586 0.203968 0.239188 0.208468 0.301146 0.091563 0.237114 0.287210 0.195207 0.237491 0.189802 0.080331 0.162242 0.083431 0.308751 0.176776 0.272485 0.294336 0.074342 0.206229 0.182324 0.200034 0.236116 0.208714 0.212602 0.159456 0.172059 0.279354 0.301419 0.253123 0.209396 0.208714 0.064145 0.306333 0.011866
PPIAL4E 0.082913 0.077281 0.072407 0.000000 0.101606 0.049005 0.069562 0.102859 0.064646 0.071577 0.136542 0.018322 0.088589 0.042976 0.102652 0.000000 0.070285 0.077666 0.032445 0.059785 0.067451 0.054723 0.109508 0.062345 0.092461 0.051373 0.048416 0.077484 0.081808 0.104151 0.069028 0.070285 0.030001 0.434622 0.011939
RP11-144L1.8 0.047031 0.054366 0.057242 0.016166 0.048636 0.043605 0.036488 0.020762 0.034358 0.031351 0.027958 0.023844 0.022586 0.023726 0.065530 0.031585 0.044631 0.048840 0.013678 0.054032 0.031368 0.041147 0.036610 0.029422 0.025938 0.032201 0.102621 0.044623 0.053435 0.033052 0.039228 0.036488 0.017041 0.434420 0.012003
CTD-3126B10.5 0.088240 0.124587 0.153419 0.063805 0.074694 0.032039 0.114412 0.125647 0.114380 0.103779 0.107588 0.039684 0.018799 0.047577 0.175522 0.044675 0.185960 0.201797 0.072022 0.129237 0.132183 0.154850 0.157869 0.171636 0.112283 0.092744 0.158688 0.174038 0.165612 0.157980 0.116525 0.116525 0.048716 0.418078 0.012218
RPSAP8 0.094640 0.070004 0.100544 0.047804 0.092002 0.043391 0.101640 0.099563 0.091700 0.088132 0.178059 0.024973 0.055056 0.032711 0.111545 0.051479 0.109262 0.235455 0.086218 0.078294 0.117707 0.091901 0.122591 0.089952 0.117309 0.083677 0.050508 0.122270 0.143301 0.109784 0.094716 0.092002 0.041203 0.435020 0.012270
RPL4P3 0.089819 0.046325 0.076606 0.047240 0.147652 0.035552 0.109989 0.124002 0.071919 0.092015 0.098047 0.035782 0.074384 0.022336 0.096751 0.095299 0.114650 0.212309 0.051907 0.088221 0.099590 0.081180 0.143903 0.058735 0.059065 0.069280 0.071439 0.102624 0.155554 0.100773 0.089098 0.089098 0.039080 0.438613 0.012286
EIF4A1P4 0.203029 0.113654 0.124857 0.096675 0.149577 0.030390 0.166548 0.102091 0.080646 0.091190 0.149838 0.035103 0.058656 0.069692 0.205180 0.053664 0.188260 0.165346 0.056453 0.089097 0.113432 0.109386 0.200548 0.093466 0.113044 0.101719 0.201463 0.105891 0.128309 0.196819 0.119801 0.113044 0.050676 0.422999 0.012482
OLA1P1 0.139019 0.160729 0.175288 0.053304 0.234279 0.150090 0.169828 0.237696 0.145506 0.152398 0.221887 0.068372 0.101237 0.222897 0.170241 0.199561 0.183287 0.202325 0.105605 0.392311 0.214107 0.151109 0.239039 0.190016 0.107512 0.132292 0.274363 0.260304 0.181059 0.225662 0.182044 0.181059 0.065164 0.357956 0.012530
CH17-258A22.4 0.055064 0.067146 0.112708 0.027988 0.079803 0.026416 0.076402 0.126419 0.059093 0.088557 0.104145 0.018845 0.078197 0.025120 0.106421 0.046698 0.123876 0.132791 0.022448 0.094247 0.107325 0.071940 0.114496 0.074145 0.081796 0.044894 0.064617 0.111500 0.125146 0.168718 0.081232 0.079803 0.036742 0.452312 0.012610
{% endraw %} {% raw %}
# Show the rows of gm (all columns) for the genes listed in TEST_GENES.
gm.loc[TEST_GENES, :]
Adipose Tissue Adrenal Gland Bladder Blood Blood Vessel Brain Breast Cervix Uteri Colon Esophagus Fallopian Tube Heart Kidney Liver Lung Muscle Nerve Ovary Pancreas Pituitary Prostate Salivary Gland Skin Small Intestine Spleen Stomach Testis Thyroid Uterus Vagina mean median std CoV sem
Description
MTOR 12.503700 12.867600 15.91410 3.034920 17.618600 7.335200 14.515800 15.856800 13.346300 18.95370 16.24130 7.552950 11.117700 7.470690 15.27560 15.049800 19.77880 14.130600 5.908580 17.69210 16.562600 12.867900 13.219000 15.28650 15.04410 11.431200 35.51280 19.088900 20.062200 13.75520 14.499841 14.515800 5.571234 0.384227 1.045136
SPEN 22.126000 13.276300 27.91580 7.535480 51.957000 9.190600 29.102500 32.065600 17.808500 25.31440 38.28860 6.737270 15.082600 8.255750 32.82270 16.856000 43.17180 45.300900 9.818300 22.89960 23.694400 22.207500 31.827800 23.24640 21.84920 14.097200 22.61140 37.896800 40.190400 30.56600 24.790427 23.246400 11.599147 0.467888 2.087982
ARID1A 21.575100 21.119900 32.26390 11.158800 31.449300 8.963920 25.887500 31.977100 22.239100 23.28200 33.73900 9.336640 14.959300 10.042500 31.07120 13.786000 39.74310 36.490400 10.182700 24.01020 25.970600 26.192400 30.119400 26.96340 37.36430 21.878200 34.56220 42.793600 38.949500 31.13720 25.640282 25.970600 9.557742 0.372763 1.810908
VHL 14.187900 15.597800 15.05510 13.236100 11.607900 7.355830 16.753100 22.691400 13.158000 11.51100 18.07390 3.841120 10.754700 7.380840 20.37670 6.291220 17.99510 20.250200 7.168310 15.02650 17.396200 17.011400 16.837400 18.68900 24.09850 11.147400 15.15270 20.510700 20.810600 18.92880 14.963181 15.152700 4.979040 0.332753 0.974132
SETD2 21.303900 16.678400 28.66660 9.205320 33.907500 8.329160 25.369900 35.798600 20.874700 25.82760 31.12030 6.620320 14.811300 9.187430 28.12470 17.834400 32.82120 36.914300 9.467970 22.91130 25.806500 21.337400 28.258000 24.60220 23.11610 16.357000 34.86910 35.956400 38.876200 28.82510 23.792630 24.602200 9.089350 0.382024 1.710937
PBRM1 18.588900 11.419700 21.28370 3.254440 26.754300 5.085370 18.675900 22.984100 14.183100 19.52040 24.06940 5.681480 6.488490 5.852270 17.65820 9.020570 25.10470 28.074500 6.016860 15.71500 15.890900 14.656900 18.446100 13.71280 15.39450 9.864260 16.15270 20.880100 27.007700 20.85120 15.942951 15.942951 6.930256 0.434691 1.264112
PIK3CA 18.194600 8.240050 14.86330 4.151520 23.187600 3.542820 17.863300 15.075700 8.536900 14.88310 15.21330 3.594430 4.734540 3.104470 15.63770 6.704610 21.42070 15.508700 3.082790 7.35694 9.887330 9.196970 12.534500 9.36220 10.40090 6.203630 8.91973 12.288400 19.116200 14.94390 11.258361 10.400900 5.530710 0.491254 0.984802
FBXW7 9.373310 5.191610 8.69599 3.572720 9.625750 5.404650 11.687200 11.709500 7.577210 8.83916 11.83320 8.615350 5.235500 4.320390 13.18320 10.586000 17.61940 9.843090 4.769250 10.28070 9.197060 10.439800 45.325000 9.60459 16.12840 8.428370 17.68990 10.706900 11.088400 12.33750 10.963637 9.843090 7.151625 0.652304 1.230762
FAT1 9.168520 0.832707 14.09520 0.030670 30.861900 4.647230 14.380400 12.861700 25.208100 22.40770 10.79140 5.060020 14.736400 3.793710 14.49690 0.945414 26.32040 10.668300 6.832860 9.89293 15.523700 16.306000 20.645700 21.05160 1.06991 8.754920 8.76238 13.789500 16.211200 11.21820 12.378852 12.378852 7.653034 0.618235 1.324488
PIK3R1 43.808500 12.812500 41.39910 8.652840 65.225100 18.599600 82.222000 70.123700 29.069200 89.52970 62.07810 15.500800 10.461800 23.951900 45.04570 32.863100 154.41700 42.912000 4.593430 11.82190 32.623900 61.103500 40.691500 29.14980 29.73640 13.794900 9.37332 29.429100 78.282500 57.94370 41.573886 32.863100 30.995987 0.745564 5.295056
APC 6.605980 5.592570 7.43279 1.818650 10.715500 18.867100 6.849220 7.786930 6.941440 9.01507 7.55060 2.739140 3.322110 2.588500 7.71383 6.257740 9.07154 5.962710 2.504500 6.96671 5.815600 5.401110 6.832000 9.80838 7.52095 3.727000 5.90447 6.467770 8.493860 7.47966 6.791781 6.832000 3.069611 0.451960 0.553060
EGFR 25.838200 14.641900 23.52960 0.045054 33.197800 4.058470 32.942200 24.717400 15.049800 31.80160 28.12270 3.118030 10.674600 16.914200 22.12730 6.789730 43.12280 20.735900 6.849740 4.23149 27.050900 31.626000 78.337400 12.85580 6.27020 11.007500 5.73138 25.038900 21.399800 40.60310 20.947650 20.947650 15.424758 0.736348 2.635818
BRAF 10.253200 6.026650 12.94650 5.531190 15.834100 4.306610 12.915000 15.536900 7.914150 11.93420 16.70070 3.309360 9.119940 3.530330 15.58570 4.557720 16.64820 16.938500 4.189910 14.88180 13.493700 8.488820 14.049000 8.96973 9.55425 7.642390 20.27730 17.941700 17.036600 13.57060 11.322825 11.934200 4.811849 0.424969 0.880892
KMT2C 11.473600 10.375000 15.13260 3.549480 19.118600 6.448020 15.411700 20.116800 10.774900 15.13570 17.60890 3.947850 6.156600 4.236210 17.29130 12.863800 21.66580 21.985600 7.782120 15.13580 12.518300 14.767600 20.062600 12.83970 12.54590 12.108000 19.88150 20.865800 22.443900 16.71820 14.032063 14.767600 5.439919 0.387678 1.018452
CDKN2A 0.584439 2.961140 0.93688 0.284581 0.990091 0.321283 0.786676 0.732269 0.496462 1.17289 1.49742 0.062258 0.146927 0.099371 1.07895 0.063926 1.20160 0.355287 0.215579 16.91060 0.942697 0.595078 0.363026 1.56748 2.57154 0.653275 7.12985 0.685411 0.404633 1.11324 1.564162 0.732269 3.089061 1.974898 0.515613
NOTCH1 30.417800 6.057770 30.03090 23.846200 24.725700 9.219590 28.396200 32.328500 28.781900 30.92170 29.39820 7.713940 18.436400 6.804660 47.51070 3.903660 38.90630 9.344980 9.692100 11.58590 29.249700 17.192100 41.740200 25.99350 45.23820 9.155010 6.39858 25.925300 33.055200 33.95680 23.197590 25.925300 12.310439 0.530677 2.173933
PTEN 45.952900 18.134600 28.76010 40.266500 41.009700 9.984610 34.776800 59.709600 19.588700 23.88120 50.96780 8.023610 14.551500 11.184400 31.91420 9.232020 47.55700 31.651900 10.067000 18.23860 27.249800 13.686600 21.251300 20.84830 35.77400 15.626900 20.65910 31.541100 43.559400 40.28480 27.531135 27.249800 13.610540 0.494369 2.428552
ATM 6.551680 6.980230 8.72422 3.503130 7.146340 3.957580 6.500810 12.772900 5.773010 6.46393 10.99050 2.184370 3.289240 2.923890 9.39258 2.088400 13.72480 18.273800 6.165060 15.34280 7.073830 6.389930 4.764550 9.20250 17.40080 5.249450 6.67047 9.957470 12.351600 8.71088 8.017358 6.980230 4.149998 0.517627 0.731643
KRAS 12.768600 9.108890 18.35200 9.088150 17.653600 6.816110 12.798200 13.308500 17.908800 22.16990 14.91450 3.658620 9.441710 6.288990 19.75970 5.028150 30.73640 12.961800 5.031810 9.70208 12.403100 14.301600 16.301900 16.58580 12.83920 14.984000 10.87170 15.033500 16.682800 18.42210 13.530740 13.308500 5.527565 0.408519 1.021173
KMT2D 13.587600 13.002500 18.24660 8.413190 18.442600 4.364470 17.131500 23.218700 12.841500 16.49320 21.97640 6.038790 8.574580 5.282640 23.43320 13.320800 24.73360 25.383000 9.015830 21.84720 18.186500 16.253500 23.492400 17.91740 22.39790 13.750000 21.58280 29.906100 29.968700 17.33260 17.204527 17.332600 6.685613 0.388596 1.251478
RB1 21.201200 17.170800 16.96180 6.785730 20.910400 8.952800 19.351000 21.072200 11.782600 14.73370 17.57230 6.696830 10.291600 4.049380 23.74090 12.726700 26.40190 15.455200 4.358140 12.01380 15.201400 18.177300 10.452200 14.37740 19.57970 9.392200 12.04230 21.379800 20.861500 22.06940 15.192073 15.201400 5.736453 0.377595 1.081228
AKT1 54.403800 73.611400 59.25930 26.834800 82.262400 19.560800 58.732900 71.120200 49.871500 83.67260 66.78540 29.234800 37.933200 32.997700 80.29900 22.800000 66.19450 53.725100 27.502900 36.38890 71.547300 51.641500 40.959300 46.36900 60.90140 50.206600 32.70520 68.032000 70.769200 55.25400 52.719223 53.725100 18.084952 0.343043 3.515221
CREBBP 22.830200 13.589200 32.14840 15.825700 41.780200 6.476420 29.244800 48.085600 20.105800 29.24990 36.54190 7.310060 10.408500 8.460790 27.47410 18.709700 38.69820 49.607400 9.415040 21.50880 27.507300 21.658600 34.862100 25.74240 28.58870 15.575700 36.98450 38.768000 53.406500 32.89360 26.781937 27.474100 12.493853 0.466503 2.251718
TP53 17.803500 13.500000 23.76590 7.698100 20.534000 3.893520 24.557400 29.501100 17.198300 12.50440 28.16160 3.678650 13.012000 6.868620 16.91990 5.443010 24.16180 32.427300 7.118010 8.45352 20.300200 18.834600 36.960900 18.41640 26.59310 12.633600 14.82610 19.663100 29.499700 25.29200 18.007344 18.007344 8.631200 0.479316 1.546453
NF1 7.388770 8.167140 9.66135 1.673030 12.509000 5.694150 9.062560 11.755900 6.261990 8.26854 10.72990 3.218220 4.429420 2.610770 9.66089 4.118920 14.86350 12.685700 3.382570 9.52086 8.715800 8.248040 7.234050 6.57104 4.85321 4.936360 12.05380 14.062000 12.854000 8.90397 8.136515 8.248040 3.451513 0.424200 0.630685
ERBB2 15.692600 5.675120 36.49920 1.248780 65.220100 6.267110 36.905700 50.025800 51.180100 33.50870 47.86210 21.170700 64.822300 12.568400 47.77960 10.820400 130.13100 38.722900 18.577700 17.11660 83.773800 83.832500 118.260000 44.17610 8.67758 34.499700 21.87070 99.307700 48.032100 78.81190 44.434566 38.722900 32.621162 0.734139 5.579469
SMARCA4 25.607600 28.193200 36.97740 8.694800 31.560400 20.617500 31.945800 44.716900 27.494700 28.36600 38.56260 11.186700 34.395300 8.662540 40.33670 19.023600 36.94870 39.168300 12.453300 39.34950 40.245200 40.062200 51.108700 28.69420 32.22920 23.902100 81.65430 48.184300 39.941300 46.80050 33.236118 33.236118 14.213998 0.427667 2.606449
APOE 262.706000 3543.250000 80.79180 2.765300 24.098100 1673.380000 321.464000 172.052000 115.864000 13.84010 111.93600 20.678500 150.055000 3181.760000 271.18900 8.651740 77.56430 566.518000 13.414200 103.44800 35.348800 63.819300 318.601000 172.87200 671.30300 34.673100 191.28500 199.905000 56.073400 131.92900 419.707855 131.929000 835.295789 1.990184 139.953545
{% endraw %} {% raw %}
# Per-sample expression matrix: take every column of gtpm except the first
# one and use the 'Description' column as the row index.
gt = gtpm[gtpm.columns[1:]].set_index('Description')
# Column labels of gt, kept under their own name.
org_cols = gt.columns

# Final expression of the cell: preview the first rows.
gt.head()
GTEX-1117F-0226-SM-5GZZ7 GTEX-1117F-0426-SM-5EGHI GTEX-1117F-0526-SM-5EGHJ GTEX-1117F-0626-SM-5N9CS GTEX-1117F-0726-SM-5GIEN GTEX-1117F-1326-SM-5EGHH GTEX-1117F-2426-SM-5EGGH GTEX-1117F-2526-SM-5GZY6 GTEX-1117F-2826-SM-5GZXL GTEX-1117F-2926-SM-5GZYI GTEX-1117F-3026-SM-5GZYU GTEX-1117F-3226-SM-5N9CT GTEX-111CU-0126-SM-5GZWZ GTEX-111CU-0226-SM-5GZXC GTEX-111CU-0326-SM-5GZXO GTEX-111CU-0426-SM-5GZY1 GTEX-111CU-0526-SM-5EGHK GTEX-111CU-0626-SM-5EGHL GTEX-111CU-0726-SM-5GZYD GTEX-111CU-0826-SM-5EGIJ GTEX-111CU-0926-SM-5EGIK GTEX-111CU-1026-SM-5EGIL GTEX-111CU-1126-SM-5EGIM GTEX-111CU-1226-SM-5EGIN GTEX-111CU-1326-SM-5NQ8L GTEX-111CU-1426-SM-5GZYP GTEX-111CU-1526-SM-5N9FS GTEX-111CU-1726-SM-5EGHM GTEX-111CU-1826-SM-5GZYN GTEX-111CU-2026-SM-5GZZC ... GTEX-ZZ64-1526-SM-5E43K GTEX-ZZ64-1626-SM-5E43W GTEX-ZZ64-1726-SM-5GZYB GTEX-ZZPT-0226-SM-5E43X GTEX-ZZPT-0626-SM-5GZXT GTEX-ZZPT-0926-SM-5GICZ GTEX-ZZPT-1326-SM-5E43H GTEX-ZZPT-1426-SM-5N9C5 GTEX-ZZPT-2926-SM-5EQ5S GTEX-ZZPT-3026-SM-5GZXH GTEX-ZZPU-0126-SM-5E446 GTEX-ZZPU-0226-SM-5N9BV GTEX-ZZPU-0326-SM-5N9BJ GTEX-ZZPU-0426-SM-5GZYH GTEX-ZZPU-0526-SM-5E44U GTEX-ZZPU-0626-SM-5E43T GTEX-ZZPU-0726-SM-5N9C8 GTEX-ZZPU-0826-SM-5GZX5 GTEX-ZZPU-0926-SM-5GZYT GTEX-ZZPU-1026-SM-5E457 GTEX-ZZPU-1126-SM-5N9CW GTEX-ZZPU-1226-SM-5N9CK GTEX-ZZPU-1326-SM-5GZWS GTEX-ZZPU-1426-SM-5GZZ6 GTEX-ZZPU-1826-SM-5E43L GTEX-ZZPU-2126-SM-5EGIU GTEX-ZZPU-2226-SM-5EGIV GTEX-ZZPU-2426-SM-5E44I GTEX-ZZPU-2626-SM-5E45Y GTEX-ZZPU-2726-SM-5NQ8O
Description
MTOR 12.79 13.700 14.79 15.45 6.296 8.68 22.71 14.93 14.94 9.055 15.58 10.590 12.42 21.23 17.97 17.02 6.041 22.20 16.18 19.83 14.19 8.643 10.33 23.82 20.36 12.86 19.02 44.54 18.71 23.610 ... 15.830 19.50 15.34 16.92 26.95 7.426 15.58 15.85 19.92 12.720 10.92 15.09 15.97 11.790 15.44 20.45 7.513 13.24 10.920 11.15 11.78 14.94 18.81 11.200 20.02 12.82 13.19 16.79 20.600 15.19
SPEN 37.22 32.910 87.38 33.10 11.830 21.19 43.72 33.65 26.62 22.560 26.30 14.380 15.33 40.09 30.87 22.51 9.733 32.98 21.33 28.09 13.74 17.630 27.16 24.21 25.62 17.69 19.62 24.45 41.31 19.210 ... 21.420 32.51 36.84 40.36 37.09 9.889 32.26 24.37 80.00 16.170 15.08 27.44 19.32 7.684 24.81 40.39 13.800 28.20 11.050 22.67 15.85 16.86 32.05 15.520 32.49 50.25 21.86 52.99 18.480 28.13
ARID1A 22.81 13.970 20.35 28.26 9.919 20.47 37.84 39.81 24.35 21.830 29.43 11.010 21.95 47.54 25.36 36.80 10.760 28.67 29.84 32.67 29.71 17.730 22.01 32.04 27.35 17.07 24.79 42.67 38.64 20.370 ... 17.360 29.00 41.02 29.43 21.55 7.293 28.46 20.93 31.13 15.730 32.55 35.47 29.71 12.670 31.88 41.52 13.250 23.76 15.090 22.25 20.75 26.75 42.60 25.240 29.71 67.65 30.18 32.11 17.010 27.22
VHL 22.10 9.959 16.77 21.48 8.596 16.99 25.42 28.32 19.43 18.490 20.46 5.300 16.52 21.26 19.32 27.98 7.408 11.98 15.10 16.83 12.76 7.381 13.28 13.49 13.37 14.95 15.84 17.06 19.02 8.517 ... 8.717 14.21 18.40 16.09 14.33 5.158 21.41 19.96 26.89 8.975 18.91 16.95 18.08 9.331 19.36 21.49 6.848 16.79 5.504 10.27 8.21 20.60 21.57 9.735 12.89 21.06 15.40 11.66 6.208 17.22
SETD2 31.61 19.910 35.49 27.82 11.080 21.28 41.47 31.24 22.92 27.180 21.72 8.886 16.93 35.43 21.80 25.56 10.790 30.53 22.05 29.02 17.66 13.310 20.96 31.02 29.68 15.61 21.69 40.43 31.62 27.170 ... 24.220 34.69 37.30 33.41 28.33 9.775 33.57 26.57 87.63 13.980 18.05 23.14 18.06 9.445 22.33 35.87 10.650 23.36 10.580 15.28 13.38 21.87 35.02 18.270 30.15 44.00 18.34 38.11 22.260 31.16

5 rows × 17382 columns

{% endraw %} {% raw %}
# Print descriptive statistics (count, mean, std, quartiles, ...) of the
# per-sample values for a few genes, one summary after another.
for symbol in ('TP53', 'ERBB2', 'PTEN'):
    print(gt.loc[symbol].describe())
count    17382.000000
mean        17.440633
std         13.114563
min          0.534400
25%          6.570500
50%         15.725000
75%         24.280000
max        106.000000
Name: TP53, dtype: float64
count    17382.000000
mean        45.592241
std         44.235695
min          0.091940
25%         10.040000
50%         30.300000
75%         71.627500
max        923.400000
Name: ERBB2, dtype: float64
count    17382.000000
mean        27.343233
std         17.688800
min          1.163000
25%         14.430000
50%         23.550000
75%         36.100000
max        164.900000
Name: PTEN, dtype: float64
{% endraw %} {% raw %}
# 90th percentile of the PTEN values across samples.  This is not the last
# expression of the cell, so the value is computed but not displayed.
# Other statistics that were tried here:
#   gt.loc['PTEN'].skew()
#   gt.loc['PTEN'].kurtosis()
gt.loc['PTEN'].quantile(q=0.9)

# Plot variants that were tried:
#   gt.T[TEST_GENES[:-1]].iplot(kind='box',)
#   gt.T[['TP53', 'PTEN']].plot(kind='box',)

# Final expression of the cell: preview the first five rows of gtpm.
gtpm.head(n=5)
Name Description GTEX-1117F-0226-SM-5GZZ7 GTEX-1117F-0426-SM-5EGHI GTEX-1117F-0526-SM-5EGHJ GTEX-1117F-0626-SM-5N9CS GTEX-1117F-0726-SM-5GIEN GTEX-1117F-1326-SM-5EGHH GTEX-1117F-2426-SM-5EGGH GTEX-1117F-2526-SM-5GZY6 GTEX-1117F-2826-SM-5GZXL GTEX-1117F-2926-SM-5GZYI GTEX-1117F-3026-SM-5GZYU GTEX-1117F-3226-SM-5N9CT GTEX-111CU-0126-SM-5GZWZ GTEX-111CU-0226-SM-5GZXC GTEX-111CU-0326-SM-5GZXO GTEX-111CU-0426-SM-5GZY1 GTEX-111CU-0526-SM-5EGHK GTEX-111CU-0626-SM-5EGHL GTEX-111CU-0726-SM-5GZYD GTEX-111CU-0826-SM-5EGIJ GTEX-111CU-0926-SM-5EGIK GTEX-111CU-1026-SM-5EGIL GTEX-111CU-1126-SM-5EGIM GTEX-111CU-1226-SM-5EGIN GTEX-111CU-1326-SM-5NQ8L GTEX-111CU-1426-SM-5GZYP GTEX-111CU-1526-SM-5N9FS GTEX-111CU-1726-SM-5EGHM ... GTEX-ZZ64-1526-SM-5E43K GTEX-ZZ64-1626-SM-5E43W GTEX-ZZ64-1726-SM-5GZYB GTEX-ZZPT-0226-SM-5E43X GTEX-ZZPT-0626-SM-5GZXT GTEX-ZZPT-0926-SM-5GICZ GTEX-ZZPT-1326-SM-5E43H GTEX-ZZPT-1426-SM-5N9C5 GTEX-ZZPT-2926-SM-5EQ5S GTEX-ZZPT-3026-SM-5GZXH GTEX-ZZPU-0126-SM-5E446 GTEX-ZZPU-0226-SM-5N9BV GTEX-ZZPU-0326-SM-5N9BJ GTEX-ZZPU-0426-SM-5GZYH GTEX-ZZPU-0526-SM-5E44U GTEX-ZZPU-0626-SM-5E43T GTEX-ZZPU-0726-SM-5N9C8 GTEX-ZZPU-0826-SM-5GZX5 GTEX-ZZPU-0926-SM-5GZYT GTEX-ZZPU-1026-SM-5E457 GTEX-ZZPU-1126-SM-5N9CW GTEX-ZZPU-1226-SM-5N9CK GTEX-ZZPU-1326-SM-5GZWS GTEX-ZZPU-1426-SM-5GZZ6 GTEX-ZZPU-1826-SM-5E43L GTEX-ZZPU-2126-SM-5EGIU GTEX-ZZPU-2226-SM-5EGIV GTEX-ZZPU-2426-SM-5E44I GTEX-ZZPU-2626-SM-5E45Y GTEX-ZZPU-2726-SM-5NQ8O
0 ENSG00000198793.12 MTOR 12.79 13.700 14.79 15.45 6.296 8.68 22.71 14.93 14.94 9.055 15.58 10.590 12.42 21.23 17.97 17.02 6.041 22.20 16.18 19.83 14.19 8.643 10.33 23.82 20.36 12.86 19.02 44.54 ... 15.830 19.50 15.34 16.92 26.95 7.426 15.58 15.85 19.92 12.720 10.92 15.09 15.97 11.790 15.44 20.45 7.513 13.24 10.920 11.15 11.78 14.94 18.81 11.200 20.02 12.82 13.19 16.79 20.600 15.19
1 ENSG00000065526.10 SPEN 37.22 32.910 87.38 33.10 11.830 21.19 43.72 33.65 26.62 22.560 26.30 14.380 15.33 40.09 30.87 22.51 9.733 32.98 21.33 28.09 13.74 17.630 27.16 24.21 25.62 17.69 19.62 24.45 ... 21.420 32.51 36.84 40.36 37.09 9.889 32.26 24.37 80.00 16.170 15.08 27.44 19.32 7.684 24.81 40.39 13.800 28.20 11.050 22.67 15.85 16.86 32.05 15.520 32.49 50.25 21.86 52.99 18.480 28.13
2 ENSG00000117713.18 ARID1A 22.81 13.970 20.35 28.26 9.919 20.47 37.84 39.81 24.35 21.830 29.43 11.010 21.95 47.54 25.36 36.80 10.760 28.67 29.84 32.67 29.71 17.730 22.01 32.04 27.35 17.07 24.79 42.67 ... 17.360 29.00 41.02 29.43 21.55 7.293 28.46 20.93 31.13 15.730 32.55 35.47 29.71 12.670 31.88 41.52 13.250 23.76 15.090 22.25 20.75 26.75 42.60 25.240 29.71 67.65 30.18 32.11 17.010 27.22
3 ENSG00000134086.7 VHL 22.10 9.959 16.77 21.48 8.596 16.99 25.42 28.32 19.43 18.490 20.46 5.300 16.52 21.26 19.32 27.98 7.408 11.98 15.10 16.83 12.76 7.381 13.28 13.49 13.37 14.95 15.84 17.06 ... 8.717 14.21 18.40 16.09 14.33 5.158 21.41 19.96 26.89 8.975 18.91 16.95 18.08 9.331 19.36 21.49 6.848 16.79 5.504 10.27 8.21 20.60 21.57 9.735 12.89 21.06 15.40 11.66 6.208 17.22
4 ENSG00000181555.20 SETD2 31.61 19.910 35.49 27.82 11.080 21.28 41.47 31.24 22.92 27.180 21.72 8.886 16.93 35.43 21.80 25.56 10.790 30.53 22.05 29.02 17.66 13.310 20.96 31.02 29.68 15.61 21.69 40.43 ... 24.220 34.69 37.30 33.41 28.33 9.775 33.57 26.57 87.63 13.980 18.05 23.14 18.06 9.445 22.33 35.87 10.650 23.36 10.580 15.28 13.38 21.87 35.02 18.270 30.15 44.00 18.34 38.11 22.260 31.16

5 rows × 17384 columns

{% endraw %} {% raw %}
import math

# Gene to inspect ('TP53' was the other candidate tried here).
gene = 'PTEN'
title = f'{gene}: Samples {len(gt.loc[gene])}'

# Earlier plotting attempts, kept for reference:
#   gtt[gene].iplot(kind='hist', title=title, yTitle='counts', xTitle='Expression (TPM)', bins=100)
#   py.iplot(gt.loc[gene].to_list(), kind='hist', title=title, yTitle='counts', xTitle='Expression (TPM)', bins=100)
#   gt.loc[gene].iplot(kind='hist', title=title, yTitle='counts', xTitle='Expression (TPM)', bins=100)
#   gt.loc[gene].plot(kind='hist', title=title, bins=100)

# Histogram (100 bins) of the log2-transformed values of the gene.
# math.log2 is applied element by element.
glog2 = gt.loc[gene].apply(math.log2)
glog2.plot.hist(title=title, bins=100)

# Final expression of the cell: Plotly histogram of the untransformed values.
px.histogram(gt.loc[gene])
{% endraw %} {% raw %}
# Overlaid TPM histograms for a handful of genes of interest.
# Wider candidate list: ['TP53', 'PTEN', 'KRAS', 'ERBB2', 'CDKN2A']
gene_list = ['TP53', 'PTEN', 'KRAS']

# Reshape to long form (one row per gene/sample pair) so plotly express can
# draw one histogram trace per gene. Passing the gene names positionally would
# bind them to `x`, i.e. they would be treated as the data to be counted
# instead of as trace labels.
expr_long = gt.loc[gene_list].T.melt(var_name='gene', value_name='TPM')
px.histogram(expr_long, x='TPM', color='gene', barmode='overlay')
{% endraw %} {% raw %}
# Density curve of one gene's TPM per tissue, plus one curve over all samples.
gene = 'TP53'
# A tissue needs MORE than this many samples present in gt to get its own curve.
min_samples = 20
data_lists = []
labels = []
for tissue in tissue_dict:
    samples = tissue_dict[tissue]['samples']
    # Keep only the annotated samples that actually have expression data.
    gt_samples = [s for s in samples if s in gt.columns]
    if len(gt_samples) <= min_samples:
        print('Only {} samples for "{}"'.format(len(gt_samples), tissue))
        continue
    # Single .loc lookup (row = gene, columns = this tissue's samples).
    dt = gt.loc[gene, gt_samples].dropna()
    if dt.empty:
        continue
    data_lists.append(dt)
    # Report the number of samples actually plotted, matching the "all" label
    # below, rather than the number annotated for the tissue.
    labels.append('{}({}) ~ {}'.format(tissue, len(dt), round(dt.median(), 1)))
# add all samples
dt = gt.loc[gene].dropna()
data_lists.append(dt)
labels.append('{} - all ({}) ~ {}'.format(gene, len(dt), round(dt.median(), 1)))

# Curves only: histogram bars and rug marks are switched off.
fig_tp53 = ff.create_distplot(data_lists, labels, show_hist=False, show_rug=False, show_curve=True)  # curve_type='normal')
fig_tp53.layout.update(title=gene)
fig_tp53.show()
Only 0 samples for "Bone Marrow"
Only 19 samples for "Cervix Uteri"
Only 9 samples for "Fallopian Tube"
{% endraw %} {% raw %}
# Compare the distribution of one gene on the raw TPM scale and the log2 scale,
# annotating each trace with its skew and kurtosis.
gene = 'PTEN'
tpm = gt.loc[gene]
# log2(0) is -inf, which would poison the statistics and the plot: mask
# non-positive values to NaN, transform once, and drop the gaps.
log2_tpm = np.log2(tpm.where(tpm > 0)).dropna()

tpm_label = "TPM      {} skew: {} kurtosis: {}".format(gene,
                                                       round(tpm.skew(), 2),
                                                       round(tpm.kurtosis(), 2))
log2_label = "log2TPM {} skew: {} kurtosis: {}".format(gene,
                                                       round(log2_tpm.skew(), 2),
                                                       round(log2_tpm.kurtosis(), 2))

# bin_size is given per trace: 0.1 for the log2 values, 2 for raw TPM.
fig_log2vsNormal = ff.create_distplot([log2_tpm, tpm],
                         [log2_label, tpm_label],
                         bin_size=[0.1, 2],
                         curve_type='normal',)
fig_log2vsNormal.show()
{% endraw %}

Data structure

Expression data such as TPM is approximately log-normally distributed.

  • There cannot be a negative number of transcripts.
  • Log transforms (such as log2 fold-change) are more normally distributed.

    More irregular curves, such as TP53, look more log-normal when the samples are grouped into relevant subsets, such as tissue types.

    Otherwise, something like sklearn.preprocessing.RobustScaler looks good for IQR scaling - robust to outliers.

    This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range). The IQR is the range between the 1st quartile (25th quantile) and the 3rd quartile (75th quantile).

Correlation

Gene expression levels are correlated or anti-correlated (mutual information/effect) within interaction pathways. 'Activation' of certain pathways is of interest for measuring cancer and drug effects.

PCA with 'whitening'

{% raw %}
# Convert the TPM matrix to the log2 scale (differences between values are
# then effectively log2 fold changes).


def safelog2(x):
    """Return log2(x), or NaN when x is zero (log2(0) is undefined)."""
    return np.log2(x) if x else np.nan


# Vectorised equivalent of applying safelog2 to every cell: zeros are masked
# to NaN first so they never reach log2, then the whole frame is transformed
# in one numpy call.
gfc = np.log2(gt.where(gt != 0))

# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
gfc.info()
print(gfc.head())
<class 'pandas.core.frame.DataFrame'>
Index: 28 entries, MTOR to APOE
Columns: 17382 entries, GTEX-1117F-0226-SM-5GZZ7 to GTEX-ZZPU-2726-SM-5NQ8O
dtypes: float64(17382)
memory usage: 3.7+ MB
None
             GTEX-1117F-0226-SM-5GZZ7  GTEX-1117F-0426-SM-5EGHI  GTEX-1117F-0526-SM-5EGHJ  GTEX-1117F-0626-SM-5N9CS  \
Description                                                                                                           
MTOR                         3.676944                  3.776104                  3.886550                  3.949535   
SPEN                         5.218006                  5.040454                  6.449231                  5.048759   
ARID1A                       4.511595                  3.804260                  4.346957                  4.820690   
VHL                          4.465974                  3.316001                  4.067811                  4.424922   
SETD2                        4.982309                  4.315421                  5.149341                  4.798051   

             GTEX-1117F-0726-SM-5GIEN  GTEX-1117F-1326-SM-5EGHH  GTEX-1117F-2426-SM-5EGGH  GTEX-1117F-2526-SM-5GZY6  \
Description                                                                                                           
MTOR                         2.654436                  3.117695                  4.505256                  3.900142   
SPEN                         3.564378                  4.405312                  5.450221                  5.072535   
ARID1A                       3.310195                  4.355439                  5.241840                  5.315059   
VHL                          3.103665                  4.086614                  4.667892                  4.823749   
SETD2                        3.469886                  4.411426                  5.373996                  4.965323   

             GTEX-1117F-2826-SM-5GZXL  GTEX-1117F-2926-SM-5GZYI  GTEX-1117F-3026-SM-5GZYU  GTEX-1117F-3226-SM-5N9CT  \
Description                                                                                                           
MTOR                         3.901108                  3.178715                  3.961623                  3.404631   
SPEN                         4.734439                  4.495695                  4.716991                  3.845992   
ARID1A                       4.605850                  4.448240                  4.879216                  3.460743   
VHL                          4.280214                  4.208673                  4.354734                  2.405992   
SETD2                        4.518535                  4.764474                  4.440952                  3.151534   

             GTEX-111CU-0126-SM-5GZWZ  GTEX-111CU-0226-SM-5GZXC  GTEX-111CU-0326-SM-5GZXO  GTEX-111CU-0426-SM-5GZY1  \
Description                                                                                                           
MTOR                         3.634593                  4.408032                  4.167519                  4.089159   
SPEN                         3.938286                  5.325171                  4.948134                  4.492494   
ARID1A                       4.456149                  5.571070                  4.664483                  5.201634   
VHL                          4.046142                  4.410070                  4.272023                  4.806324   
SETD2                        4.081510                  5.146900                  4.446256                  4.675816   

             GTEX-111CU-0526-SM-5EGHK  GTEX-111CU-0626-SM-5EGHL  GTEX-111CU-0726-SM-5GZYD  GTEX-111CU-0826-SM-5EGIJ  \
Description                                                                                                           
MTOR                         2.594787                  4.472488                  4.016140                  4.309613   
SPEN                         3.282885                  5.043519                  4.414812                  4.811985   
ARID1A                       3.427606                  4.841470                  4.899176                  5.029895   
VHL                          2.889084                  3.582556                  3.916477                  4.072963   
SETD2                        3.431623                  4.932156                  4.462707                  4.858976   

             GTEX-111CU-0926-SM-5EGIK  GTEX-111CU-1026-SM-5EGIL  GTEX-111CU-1126-SM-5EGIM  GTEX-111CU-1226-SM-5EGIN  \
Description                                                                                                           
MTOR                         3.826803                  3.111532                  3.368768                  4.574102   
SPEN                         3.780310                  4.139961                  4.763412                  4.597531   
ARID1A                       4.892877                  4.148121                  4.460087                  5.001802   
VHL                          3.673556                  2.883816                  3.731183                  3.753818   
SETD2                        4.142413                  3.734439                  4.389567                  4.955127   

             GTEX-111CU-1326-SM-5NQ8L  GTEX-111CU-1426-SM-5GZYP  GTEX-111CU-1526-SM-5N9FS  GTEX-111CU-1726-SM-5EGHM  \
Description                                                                                                           
MTOR                         4.347666                  3.684819                  4.249445                  5.477030   
SPEN                         4.679199                  4.144862                  4.294253                  4.611763   
ARID1A                       4.773469                  4.093391                  4.631686                  5.415150   
VHL                          3.740928                  3.902074                  3.985500                  4.092546   
SETD2                        4.891419                  3.964399                  4.438958                  5.337354   

             GTEX-111CU-1826-SM-5GZYN  GTEX-111CU-2026-SM-5GZZC  ...  GTEX-ZZ64-1526-SM-5E43K  \
Description                                                      ...                            
MTOR                         4.225738                  4.561326  ...                 3.984589   
SPEN                         5.368419                  4.263786  ...                 4.420887   
ARID1A                       5.272023                  4.348374  ...                 4.117695   
VHL                          4.249445                  3.090345  ...                 3.123832   
SETD2                        4.982765                  4.763943  ...                 4.598127   

             GTEX-ZZ64-1626-SM-5E43W  GTEX-ZZ64-1726-SM-5GZYB  GTEX-ZZPT-0226-SM-5E43X  GTEX-ZZPT-0626-SM-5GZXT  \
Description                                                                                                       
MTOR                        4.285402                 3.939227                 4.080658                 4.752213   
SPEN                        5.022812                 5.203201                 5.334854                 5.212958   
ARID1A                      4.857981                 5.358256                 4.879216                 4.429616   
VHL                         3.828835                 4.201634                 4.008092                 3.840967   
SETD2                       5.116448                 5.221104                 5.062208                 4.824259   

             GTEX-ZZPT-0926-SM-5GICZ  GTEX-ZZPT-1326-SM-5E43H  GTEX-ZZPT-1426-SM-5N9C5  GTEX-ZZPT-2926-SM-5EQ5S  \
Description                                                                                                       
MTOR                        2.892585                 3.961623                 3.986411                 4.316146   
SPEN                        3.305825                 5.011675                 4.607034                 6.321928   
ARID1A                      2.866512                 4.830864                 4.387500                 4.960234   
VHL                         2.366812                 4.420213                 4.319040                 4.748998   
SETD2                       3.289097                 5.069101                 4.731726                 6.453353   

             GTEX-ZZPT-3026-SM-5GZXH  GTEX-ZZPU-0126-SM-5E446  GTEX-ZZPU-0226-SM-5N9BV  GTEX-ZZPU-0326-SM-5N9BJ  \
Description                                                                                                       
MTOR                        3.669027                 3.448901                 3.915521                 3.997292   
SPEN                        4.015248                 3.914565                 4.778209                 4.272023   
ARID1A                      3.975447                 5.024586                 5.148527                 4.892877   
VHL                         3.165912                 4.241077                 4.083213                 4.176323   
SETD2                       3.805292                 4.173927                 4.532317                 4.174726   

             GTEX-ZZPU-0426-SM-5GZYH  GTEX-ZZPU-0526-SM-5E44U  GTEX-ZZPU-0626-SM-5E43T  GTEX-ZZPU-0726-SM-5N9C8  \
Description                                                                                                       
MTOR                        3.559492                 3.948601                 4.354029                 2.909389   
SPEN                        2.941858                 4.632850                 5.335926                 3.786596   
ARID1A                      3.663345                 4.994580                 5.375735                 3.727920   
VHL                         3.222032                 4.275007                 4.425594                 2.775683   
SETD2                       3.239551                 4.480911                 5.164706                 3.412782   

             GTEX-ZZPU-0826-SM-5GZX5  GTEX-ZZPU-0926-SM-5GZYT  GTEX-ZZPU-1026-SM-5E457  GTEX-ZZPU-1126-SM-5N9CW  \
Description                                                                                                       
MTOR                        3.726831                 3.448901                 3.478972                 3.558268   
SPEN                        4.817623                 3.465974                 4.502712                 3.986411   
ARID1A                      4.570463                 3.915521                 4.475733                 4.375039   
VHL                         4.069530                 2.460480                 3.360364                 3.037382   
SETD2                       4.545968                 3.403268                 3.933573                 3.742006   

             GTEX-ZZPU-1226-SM-5N9CK  GTEX-ZZPU-1326-SM-5GZWS  GTEX-ZZPU-1426-SM-5GZZ6  GTEX-ZZPU-1826-SM-5E43L  \
Description                                                                                                       
MTOR                        3.901108                 4.233428                 3.485427                 4.323370   
SPEN                        4.075533                 5.002252                 3.956057                 5.021924   
ARID1A                      4.741467                 5.412782                 4.657640                 4.892877   
VHL                         4.364572                 4.430954                 3.283181                 3.688180   
SETD2                       4.450881                 5.130107                 4.191405                 4.914086   

             GTEX-ZZPU-2126-SM-5EGIU  GTEX-ZZPU-2226-SM-5EGIV  GTEX-ZZPU-2426-SM-5E44I  GTEX-ZZPU-2626-SM-5E45Y  \
Description                                                                                                       
MTOR                        3.680324                 3.721373                 4.069530                 4.364572   
SPEN                        5.651052                 4.450221                 5.727648                 4.207893   
ARID1A                      6.080018                 4.915521                 5.004951                 4.088311   
VHL                         4.396434                 3.944858                 3.543496                 2.634129   
SETD2                       5.459432                 4.196922                 5.252098                 4.476382   

             GTEX-ZZPU-2726-SM-5NQ8O  
Description                           
MTOR                        3.925050  
SPEN                        4.814038  
ARID1A                      4.766595  
VHL                         4.106013  
SETD2                       4.961623  

[5 rows x 17382 columns]
{% endraw %} {% raw %}
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler

# scikit-learn scalers work per feature (column). gfc is transposed so that
# rows are samples and columns are genes, i.e. each gene is centred and scaled
# across all samples.
rs = RobustScaler()
scaler = rs.fit(gfc.T)
scaled_values = scaler.transform(gfc.T)

# Re-attach the labels (samples as index, genes as columns) and discard any
# sample that still has a missing value.
gfc_norm = pandas.DataFrame(scaled_values, index=gfc.columns, columns=gfc.index).dropna()
print(gfc_norm.head())
gfc_norm.describe()
Description                   MTOR      SPEN    ARID1A       VHL     SETD2     PBRM1    PIK3CA     FBXW7      FAT1  \
GTEX-1117F-0226-SM-5GZZ7 -0.148990  0.500753 -0.076494  0.651610  0.357947  0.219159  0.657689  0.137941  0.315719   
GTEX-1117F-0426-SM-5EGHI -0.035703  0.369781 -0.714965 -0.533909 -0.193806 -0.973633 -0.204115  0.442740 -1.616288   
GTEX-1117F-0526-SM-5EGHJ  0.090478  1.408967 -0.225103  0.241139  0.496142 -0.051157  0.834038 -0.087266 -0.240140   
GTEX-1117F-0626-SM-5N9CS  0.162436  0.375907  0.202509  0.609288  0.205500  0.361176  0.751966  0.279533  0.210570   
GTEX-1117F-0726-SM-5GIEN -1.317176 -0.719048 -1.160931 -0.752808 -0.893365 -0.819850 -0.339990 -0.905276 -0.563476   

Description                 PIK3R1       APC      EGFR      BRAF     KMT2C    CDKN2A    NOTCH1      PTEN       ATM  \
GTEX-1117F-0226-SM-5GZZ7  0.697855 -0.016439  0.845291  0.483317  0.713148  0.650181  0.269254  0.645109  0.693011   
GTEX-1117F-0426-SM-5EGHI  0.839000 -1.259885 -0.359273 -0.319380 -0.093061 -0.308653 -0.997207 -0.249149 -1.039290   
GTEX-1117F-0526-SM-5EGHJ  1.083432  0.092859  0.468442  1.176535  0.952999  1.045006 -0.087333  0.976693  0.827301   
GTEX-1117F-0626-SM-5N9CS  0.266064  0.028675  0.020123  0.449438  0.114631  0.395039  0.186295  0.908214  0.600383   
GTEX-1117F-0726-SM-5GIEN -0.453203 -1.498813 -0.735492 -0.569445 -1.183474 -0.443965 -0.457604 -0.592367  0.004370   

Description                   KRAS     KMT2D       RB1      AKT1    CREBBP      TP53       NF1     ERBB2   SMARCA4  \
GTEX-1117F-0226-SM-5GZZ7  0.500282  0.799588  0.130113  0.925233  0.531411  0.153948  0.634519 -0.085982 -0.003709   
GTEX-1117F-0426-SM-5EGHI -1.230646 -0.031126 -0.119219 -0.485034 -0.020414 -0.674963 -0.451763 -0.587296 -0.878672   
GTEX-1117F-0526-SM-5EGHJ  1.320150  0.705832  0.243555  0.928997  0.748787 -0.102683  0.764680 -0.275321 -0.450558   
GTEX-1117F-0626-SM-5N9CS  0.502313  0.246376  0.798581  0.589489  0.154016  0.436752  0.379416  0.282353  0.024790   
GTEX-1117F-0726-SM-5GIEN -1.295053 -1.149230 -0.609899 -0.551651 -0.914004 -0.589620 -0.592024 -0.196869 -1.202438   

Description                   APOE  
GTEX-1117F-0226-SM-5GZZ7  0.199434  
GTEX-1117F-0426-SM-5EGHI -0.914346  
GTEX-1117F-0526-SM-5EGHJ -0.546150  
GTEX-1117F-0626-SM-5N9CS  0.301969  
GTEX-1117F-0726-SM-5GIEN  0.307977  
Description MTOR SPEN ARID1A VHL SETD2 PBRM1 PIK3CA FBXW7 FAT1 PIK3R1 APC EGFR BRAF KMT2C CDKN2A NOTCH1 PTEN ATM KRAS KMT2D RB1 AKT1 CREBBP TP53 NF1 ERBB2 SMARCA4 APOE
count 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000 17254.000000
mean -0.180154 -0.124893 -0.191584 -0.123302 -0.202052 -0.227457 -0.124120 0.131909 -0.289027 -0.017136 -0.010861 -0.237629 -0.153919 -0.179802 -0.008763 -0.130600 -0.056745 -0.060825 -0.177907 -0.185866 -0.126496 -0.096339 -0.165568 -0.170011 -0.124443 -0.098012 -0.107083 -0.085304
std 0.994860 0.732150 0.788455 0.872097 0.748139 0.751689 0.699316 1.062772 1.226803 0.768987 1.086254 0.930044 0.733446 0.837761 0.853479 0.622581 0.726810 0.760068 0.848593 0.835849 0.768758 0.724838 0.720886 0.688306 0.803387 0.650717 0.914930 0.684385
min -6.382162 -4.475624 -5.082872 -5.237329 -4.290571 -4.055744 -3.680743 -5.686402 -6.706602 -3.683612 -6.606164 -5.080566 -4.035095 -5.231510 -3.110570 -2.367126 -3.280455 -3.727971 -4.467900 -4.593823 -4.729838 -3.981207 -3.737603 -2.587375 -5.833611 -2.847050 -6.063147 -2.668327
25% -0.552938 -0.583061 -0.629290 -0.554890 -0.657347 -0.682448 -0.587164 -0.426906 -0.534171 -0.484642 -0.507518 -0.637421 -0.607544 -0.590769 -0.508958 -0.627931 -0.526629 -0.541836 -0.605389 -0.605918 -0.578234 -0.543230 -0.618634 -0.654262 -0.554209 -0.555103 -0.487614 -0.570331
50% 0.001177 0.003884 0.005908 0.005206 0.003054 0.005172 0.005377 0.001318 0.006000 0.001582 0.002041 0.006536 0.005341 0.003287 0.001417 0.006658 0.005773 0.005438 0.006806 0.002332 0.003989 0.007014 0.004557 0.006058 0.006111 0.005430 0.003699 0.006185
75% 0.444170 0.413185 0.360695 0.432292 0.337764 0.311691 0.409298 0.574786 0.454304 0.513734 0.490753 0.357142 0.381714 0.407422 0.490911 0.366244 0.467575 0.447838 0.384411 0.392779 0.420094 0.451410 0.377927 0.335494 0.436389 0.441178 0.504379 0.421183
max 2.277862 2.010655 1.446708 2.973618 2.316551 1.343447 1.636754 3.843333 2.118834 2.228243 4.909765 1.279549 1.980743 1.978984 2.484339 1.629158 2.122408 2.524300 1.994245 1.742493 2.701591 1.869702 1.491306 1.459905 1.959968 1.738975 2.294416 1.639815
{% endraw %} {% raw %}
# Density curve of one gene's scaled log2 expression per tissue, plus one
# curve over all samples. gfc_norm has samples as rows and genes as columns.
gene = 'PTEN'
# A tissue needs MORE than this many samples present in gfc_norm to get a curve.
min_samples = 20
data_lists = []
labels = []
for tissue in tissue_dict:
    samples = tissue_dict[tissue]['samples']
    # Keep only the annotated samples that survived scaling / dropna.
    gt_samples = [s for s in samples if s in gfc_norm.index]
    if len(gt_samples) <= min_samples:
        print('Only {} samples for "{}"'.format(len(gt_samples), tissue))
        continue
    # Single .loc lookup (rows = this tissue's samples, column = gene).
    dt = gfc_norm.loc[gt_samples, gene].dropna()
    if dt.empty:
        continue
    data_lists.append(dt)
    # Report the number of samples actually plotted, matching the "all" label
    # below, rather than the number annotated for the tissue.
    labels.append('{}({}) ~ {}'.format(tissue, len(dt), round(dt.median(), 1)))
# add all samples
dt = gfc_norm[gene].dropna()
data_lists.append(dt)
labels.append('{} - all ({}) ~ {}'.format(gene, len(dt), round(dt.median(), 1)))

# Curves only: histogram bars and rug marks are switched off.
fig_log2tissue = ff.create_distplot(data_lists, labels, show_hist=False, show_rug=False, show_curve=True)  # curve_type='normal')
fig_log2tissue.layout.update(title=gene)
fig_log2tissue.show()
Only 0 samples for "Bone Marrow"
Only 19 samples for "Cervix Uteri"
Only 9 samples for "Fallopian Tube"
{% endraw %} {% raw %}
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA

import sklearn.decomposition
import sklearn.discriminant_analysis
import sklearn.ensemble
import sklearn.metrics

# For each scikit-learn submodule of interest, print its name followed by the
# names it exposes that do not start with an underscore.
for sk_label, sk_module in (
    ('sklearn.decomposition', sklearn.decomposition),
    ('sklearn.discriminant_analysis', sklearn.discriminant_analysis),
    ('sklearn.ensemble', sklearn.ensemble),
    ('sklearn.metrics', sklearn.metrics),
):
    print(sk_label)
    pprint([attr for attr in dir(sk_module) if not str(attr).startswith('_')])

# Unfiltered listing of sklearn.metrics, underscore-prefixed names included.
pprint(dir(sklearn.metrics))
sklearn.decomposition
['DictionaryLearning',
 'FactorAnalysis',
 'FastICA',
 'IncrementalPCA',
 'KernelPCA',
 'LatentDirichletAllocation',
 'MiniBatchDictionaryLearning',
 'MiniBatchSparsePCA',
 'NMF',
 'PCA',
 'SparseCoder',
 'SparsePCA',
 'TruncatedSVD',
 'dict_learning',
 'dict_learning_online',
 'fastica',
 'non_negative_factorization',
 'randomized_svd',
 'sparse_encode',
 'warnings']
sklearn.discriminant_analysis
['BaseEstimator',
 'ClassifierMixin',
 'LinearClassifierMixin',
 'LinearDiscriminantAnalysis',
 'QuadraticDiscriminantAnalysis',
 'StandardScaler',
 'TransformerMixin',
 'check_array',
 'check_classification_targets',
 'check_is_fitted',
 'empirical_covariance',
 'expit',
 'ledoit_wolf',
 'linalg',
 'np',
 'shrunk_covariance',
 'softmax',
 'unique_labels',
 'warnings']
sklearn.ensemble
['AdaBoostClassifier',
 'AdaBoostRegressor',
 'BaggingClassifier',
 'BaggingRegressor',
 'BaseEnsemble',
 'ExtraTreesClassifier',
 'ExtraTreesRegressor',
 'GradientBoostingClassifier',
 'GradientBoostingRegressor',
 'IsolationForest',
 'RandomForestClassifier',
 'RandomForestRegressor',
 'RandomTreesEmbedding',
 'StackingClassifier',
 'StackingRegressor',
 'VotingClassifier',
 'VotingRegressor',
 'typing']
sklearn.metrics
['ConfusionMatrixDisplay',
 'PrecisionRecallDisplay',
 'RocCurveDisplay',
 'SCORERS',
 'accuracy_score',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'auc',
 'average_precision_score',
 'balanced_accuracy_score',
 'brier_score_loss',
 'calinski_harabasz_score',
 'check_scoring',
 'classification_report',
 'cluster',
 'cohen_kappa_score',
 'completeness_score',
 'confusion_matrix',
 'consensus_score',
 'coverage_error',
 'davies_bouldin_score',
 'dcg_score',
 'euclidean_distances',
 'explained_variance_score',
 'f1_score',
 'fbeta_score',
 'fowlkes_mallows_score',
 'get_scorer',
 'hamming_loss',
 'hinge_loss',
 'homogeneity_completeness_v_measure',
 'homogeneity_score',
 'jaccard_score',
 'label_ranking_average_precision_score',
 'label_ranking_loss',
 'log_loss',
 'make_scorer',
 'matthews_corrcoef',
 'max_error',
 'mean_absolute_error',
 'mean_gamma_deviance',
 'mean_poisson_deviance',
 'mean_squared_error',
 'mean_squared_log_error',
 'mean_tweedie_deviance',
 'median_absolute_error',
 'multilabel_confusion_matrix',
 'mutual_info_score',
 'nan_euclidean_distances',
 'ndcg_score',
 'normalized_mutual_info_score',
 'pairwise',
 'pairwise_distances',
 'pairwise_distances_argmin',
 'pairwise_distances_argmin_min',
 'pairwise_distances_chunked',
 'pairwise_kernels',
 'plot_confusion_matrix',
 'plot_precision_recall_curve',
 'plot_roc_curve',
 'precision_recall_curve',
 'precision_recall_fscore_support',
 'precision_score',
 'r2_score',
 'recall_score',
 'roc_auc_score',
 'roc_curve',
 'silhouette_samples',
 'silhouette_score',
 'v_measure_score',
 'zero_one_loss']
['ConfusionMatrixDisplay',
 'PrecisionRecallDisplay',
 'RocCurveDisplay',
 'SCORERS',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_base',
 '_classification',
 '_pairwise_fast',
 '_plot',
 '_ranking',
 '_regression',
 '_scorer',
 'accuracy_score',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'auc',
 'average_precision_score',
 'balanced_accuracy_score',
 'brier_score_loss',
 'calinski_harabasz_score',
 'check_scoring',
 'classification_report',
 'cluster',
 'cohen_kappa_score',
 'completeness_score',
 'confusion_matrix',
 'consensus_score',
 'coverage_error',
 'davies_bouldin_score',
 'dcg_score',
 'euclidean_distances',
 'explained_variance_score',
 'f1_score',
 'fbeta_score',
 'fowlkes_mallows_score',
 'get_scorer',
 'hamming_loss',
 'hinge_loss',
 'homogeneity_completeness_v_measure',
 'homogeneity_score',
 'jaccard_score',
 'label_ranking_average_precision_score',
 'label_ranking_loss',
 'log_loss',
 'make_scorer',
 'matthews_corrcoef',
 'max_error',
 'mean_absolute_error',
 'mean_gamma_deviance',
 'mean_poisson_deviance',
 'mean_squared_error',
 'mean_squared_log_error',
 'mean_tweedie_deviance',
 'median_absolute_error',
 'multilabel_confusion_matrix',
 'mutual_info_score',
 'nan_euclidean_distances',
 'ndcg_score',
 'normalized_mutual_info_score',
 'pairwise',
 'pairwise_distances',
 'pairwise_distances_argmin',
 'pairwise_distances_argmin_min',
 'pairwise_distances_chunked',
 'pairwise_kernels',
 'plot_confusion_matrix',
 'plot_precision_recall_curve',
 'plot_roc_curve',
 'precision_recall_curve',
 'precision_recall_fscore_support',
 'precision_score',
 'r2_score',
 'recall_score',
 'roc_auc_score',
 'roc_curve',
 'silhouette_samples',
 'silhouette_score',
 'v_measure_score',
 'zero_one_loss']
{% endraw %} {% raw %}
# sparse PCA is probably better - interpret which genes are more important.

# Two 3-component decompositions are set up side by side: a dense PCA and a
# SparsePCA.
pca3 = PCA(n_components=3)  # Alt opt PCA(0.99) - enough components for 99% of variance
sparse_pca = sklearn.decomposition.SparsePCA(n_components=3)

# Fit each model on gfc_norm and keep the transformed coordinates.
# NOTE(review): neither estimator is given a random_state, so results may not
# be exactly reproducible between runs - confirm whether that matters here.
sparse_comp = sparse_pca.fit_transform(gfc_norm)
components3 = pca3.fit_transform(gfc_norm)

# Preview the first rows of the PCA coordinates; the trailing expression is
# the explained-variance ratio of each of the three components.
print(pandas.DataFrame(components3).head())
pca3.explained_variance_ratio_
          0         1         2
0 -2.792102  0.425145 -0.614332
1  1.543815  0.794585  0.039227
2 -3.102035  1.219043 -0.225359
3 -2.452548  0.656707 -0.298955
4  3.368639  0.083306 -1.012018
array([0.57294997, 0.09913305, 0.07949034])
{% endraw %} {% raw %}
# Loadings of the fitted SparsePCA: one row per component, one column per
# column of gfc_norm (the genes).
sparse_comp = pandas.DataFrame(sparse_pca.components_, columns=gfc_norm.columns)
print(sparse_comp)

# First sparse component, signed loadings.
pc1 = sparse_comp.loc[0]
print(pc1)

# Rank the genes by the magnitude of their loading, largest first.
pc1 = pc1.abs().sort_values(ascending=False)
print(pc1)

# Genes whose loading is not exactly zero, in ranked order.
pc1_gene_list = pc1[pc1 != 0].index.tolist()
print(pc1_gene_list)
Description      MTOR      SPEN    ARID1A       VHL     SETD2     PBRM1    PIK3CA     FBXW7      FAT1    PIK3R1  \
0           -0.204108 -0.208621 -0.255312 -0.274413 -0.235343 -0.211983 -0.207278 -0.089023  0.000000 -0.119284   
1           -0.151156 -0.083812 -0.026289  0.127043 -0.040954 -0.134514 -0.061783  0.000000 -0.699149 -0.143657   
2            0.209030  0.031002  0.013188  0.126413  0.031577 -0.015873 -0.011097  0.491592  0.090493  0.058389   

Description       APC      EGFR      BRAF     KMT2C    CDKN2A    NOTCH1      PTEN       ATM      KRAS     KMT2D  \
0            0.000000 -0.030375 -0.208920 -0.220681 -0.148508 -0.141698 -0.240484 -0.221595 -0.229653 -0.264523   
1           -0.053347 -0.509385  0.000000 -0.086462  0.038754 -0.039891  0.115955  0.036659 -0.007470 -0.007883   
2            0.646706  0.000000  0.117339  0.111699  0.054104 -0.123048 -0.051550  0.028121  0.095945  0.000000   

Description       RB1      AKT1    CREBBP      TP53       NF1     ERBB2   SMARCA4      APOE  
0           -0.221874 -0.201674 -0.231002 -0.187521 -0.154808 -0.065193 -0.177401  0.075653  
1            0.005923 -0.085638 -0.016669 -0.074272 -0.119456 -0.289114 -0.072660 -0.127148  
2            0.036701 -0.069100 -0.008859 -0.163078  0.220803 -0.104270  0.259649  0.222539  
Description
MTOR      -0.204108
SPEN      -0.208621
ARID1A    -0.255312
VHL       -0.274413
SETD2     -0.235343
PBRM1     -0.211983
PIK3CA    -0.207278
FBXW7     -0.089023
FAT1       0.000000
PIK3R1    -0.119284
APC        0.000000
EGFR      -0.030375
BRAF      -0.208920
KMT2C     -0.220681
CDKN2A    -0.148508
NOTCH1    -0.141698
PTEN      -0.240484
ATM       -0.221595
KRAS      -0.229653
KMT2D     -0.264523
RB1       -0.221874
AKT1      -0.201674
CREBBP    -0.231002
TP53      -0.187521
NF1       -0.154808
ERBB2     -0.065193
SMARCA4   -0.177401
APOE       0.075653
Name: 0, dtype: float64
Description
VHL        0.274413
KMT2D      0.264523
ARID1A     0.255312
PTEN       0.240484
SETD2      0.235343
CREBBP     0.231002
KRAS       0.229653
RB1        0.221874
ATM        0.221595
KMT2C      0.220681
PBRM1      0.211983
BRAF       0.208920
SPEN       0.208621
PIK3CA     0.207278
MTOR       0.204108
AKT1       0.201674
TP53       0.187521
SMARCA4    0.177401
NF1        0.154808
CDKN2A     0.148508
NOTCH1     0.141698
PIK3R1     0.119284
FBXW7      0.089023
APOE       0.075653
ERBB2      0.065193
EGFR       0.030375
APC        0.000000
FAT1       0.000000
Name: 0, dtype: float64
['VHL', 'KMT2D', 'ARID1A', 'PTEN', 'SETD2', 'CREBBP', 'KRAS', 'RB1', 'ATM', 'KMT2C', 'PBRM1', 'BRAF', 'SPEN', 'PIK3CA', 'MTOR', 'AKT1', 'TP53', 'SMARCA4', 'NF1', 'CDKN2A', 'NOTCH1', 'PIK3R1', 'FBXW7', 'APOE', 'ERBB2', 'EGFR']
{% endraw %} {% raw %}
# Load the median-TPM table (tab separated, first two lines skipped) and key
# the rows by the ('Description', 'Name') column pair.
gtm = pandas.read_csv(gtex_tpm_med_fn, sep='\t', skiprows=2, low_memory=False)
gtm = gtm.set_index(['Description', 'Name'])

# Split every column label on ' - ' into a tissue (the part before the first
# separator) and a subtype (the label with the "<tissue> - " prefix removed).
# Labels without a separator keep the full label at both levels.
tissues = []
subtypes = []
for column_label in gtm.columns:
    tissue_name = column_label.split(' - ')[0]
    tissues.append(tissue_name)
    subtypes.append(column_label.replace(tissue_name + ' - ', ''))

# Two-level column index: (tissue, subtype).
gtm.columns = [tissues, subtypes]
print(gtm.head())

# Example sub-table: every 'Brain' subtype column for TP53.
print(gtm.loc['TP53', 'Brain'])

# Makes some reasonable subtables.  With droplevel seems to work ok.
                                   Adipose                    Adrenal Gland   Artery                    Bladder  \
                              Subcutaneous Visceral (Omentum) Adrenal Gland    Aorta Coronary   Tibial  Bladder   
Description Name                                                                                                  
DDX11L1     ENSG00000223972.5      0.00000            0.00000       0.00000  0.00000  0.00000  0.00000  0.00000   
WASH7P      ENSG00000227232.5      4.06403            3.37111       2.68549  4.04762  3.90076  3.63963  5.16375   
MIR6859-1   ENSG00000278267.1      0.00000            0.00000       0.00000  0.00000  0.00000  0.00000  0.00000   
MIR1302-2HG ENSG00000243485.5      0.00000            0.00000       0.00000  0.00000  0.00000  0.00000  0.00000   
FAM138A     ENSG00000237613.2      0.00000            0.00000       0.00000  0.00000  0.00000  0.00000  0.00000   

                                 Brain                                                                                 \
                              Amygdala Anterior cingulate cortex (BA24) Caudate (basal ganglia) Cerebellar Hemisphere   
Description Name                                                                                                        
DDX11L1     ENSG00000223972.5  0.00000                          0.00000                0.000000               0.00000   
WASH7P      ENSG00000227232.5  1.43859                          1.69285                1.566050               4.99231   
MIR6859-1   ENSG00000278267.1  0.00000                          0.00000                0.000000               0.00000   
MIR1302-2HG ENSG00000243485.5  0.00000                          0.00000                0.024264               0.00000   
FAM138A     ENSG00000237613.2  0.00000                          0.00000                0.000000               0.00000   

                                                                                                  \
                              Cerebellum    Cortex Frontal Cortex (BA9) Hippocampus Hypothalamus   
Description Name                                                                                   
DDX11L1     ENSG00000223972.5    0.00000  0.000000             0.000000     0.00000     0.000000   
WASH7P      ENSG00000227232.5    5.72099  2.483170             2.146670     1.68599     1.748110   
MIR6859-1   ENSG00000278267.1    0.00000  0.000000             0.000000     0.00000     0.000000   
MIR1302-2HG ENSG00000243485.5    0.00000  0.027366             0.030382     0.00000     0.024714   
FAM138A     ENSG00000237613.2    0.00000  0.000000             0.000000     0.00000     0.000000   

                                                                                                                    \
                              Nucleus accumbens (basal ganglia) Putamen (basal ganglia) Spinal cord (cervical c-1)   
Description Name                                                                                                     
DDX11L1     ENSG00000223972.5                          0.000000                0.000000                    0.00000   
WASH7P      ENSG00000227232.5                          1.538990                1.441670                    2.73049   
MIR6859-1   ENSG00000278267.1                          0.000000                0.000000                    0.00000   
MIR1302-2HG ENSG00000243485.5                          0.030669                0.023474                    0.00000   
FAM138A     ENSG00000237613.2                          0.000000                0.000000                    0.00000   

                                                       Breast                Cells                              \
                              Substantia nigra Mammary Tissue Cultured fibroblasts EBV-transformed lymphocytes   
Description Name                                                                                                 
DDX11L1     ENSG00000223972.5         0.000000        0.00000               0.0000                     0.00000   
WASH7P      ENSG00000227232.5         1.741940        4.43876               1.6786                     2.49477   
MIR6859-1   ENSG00000278267.1         0.000000        0.00000               0.0000                     0.00000   
MIR1302-2HG ENSG00000243485.5         0.019526        0.00000               0.0000                     0.00000   
FAM138A     ENSG00000237613.2         0.000000        0.00000               0.0000                     0.00000   

                                  Cervix               Colon                            Esophagus                      \
                              Ectocervix Endocervix  Sigmoid Transverse Gastroesophageal Junction   Mucosa Muscularis   
Description Name                                                                                                        
DDX11L1     ENSG00000223972.5    0.00000    0.00000  0.00000    0.00000                   0.00000  0.00000    0.00000   
WASH7P      ENSG00000227232.5    5.62935    7.09749  4.64777    3.59509                   4.32641  3.11749    4.10335   
MIR6859-1   ENSG00000278267.1    0.00000    0.00000  0.00000    0.00000                   0.00000  0.00000    0.00000   
MIR1302-2HG ENSG00000243485.5    0.00000    0.00000  0.00000    0.00000                   0.00000  0.00000    0.00000   
FAM138A     ENSG00000237613.2    0.00000    0.00000  0.00000    0.00000                   0.00000  0.00000    0.00000   

                              Fallopian Tube            Heart                   Kidney             Liver     Lung  \
                              Fallopian Tube Atrial Appendage Left Ventricle    Cortex  Medulla    Liver     Lung   
Description Name                                                                                                    
DDX11L1     ENSG00000223972.5        0.00000          0.00000       0.000000  0.000000  0.00000  0.00000  0.00000   
WASH7P      ENSG00000227232.5        6.13409          1.52031       0.924962  2.770810  2.21451  1.76541  4.50841   
MIR6859-1   ENSG00000278267.1        0.00000          0.00000       0.000000  0.000000  0.00000  0.00000  0.00000   
MIR1302-2HG ENSG00000243485.5        0.00000          0.00000       0.017893  0.017665  0.00000  0.00000  0.00000   
FAM138A     ENSG00000237613.2        0.00000          0.00000       0.000000  0.000000  0.00000  0.00000  0.00000   

                              Minor Salivary Gland   Muscle    Nerve   Ovary Pancreas Pituitary Prostate  \
                              Minor Salivary Gland Skeletal   Tibial   Ovary Pancreas Pituitary Prostate   
Description Name                                                                                           
DDX11L1     ENSG00000223972.5              0.00000  0.00000  0.00000  0.0000  0.00000   0.00000  0.00000   
WASH7P      ENSG00000227232.5              3.52767  1.41667  6.68531  6.6341  1.80871   5.42546  7.08318   
MIR6859-1   ENSG00000278267.1              0.00000  0.00000  0.00000  0.0000  0.00000   0.00000  0.00000   
MIR1302-2HG ENSG00000243485.5              0.00000  0.00000  0.00000  0.0000  0.00000   0.00000  0.00000   
FAM138A     ENSG00000237613.2              0.00000  0.00000  0.00000  0.0000  0.00000   0.00000  0.00000   

                                                      Skin                         Small Intestine   Spleen  Stomach  \
                              Not Sun Exposed (Suprapubic) Sun Exposed (Lower leg)  Terminal Ileum   Spleen  Stomach   
Description Name                                                                                                       
DDX11L1     ENSG00000223972.5                      0.00000                 0.00000         0.00000  0.00000  0.00000   
WASH7P      ENSG00000227232.5                      5.93298                 6.13265         4.19378  5.92631  3.06248   
MIR6859-1   ENSG00000278267.1                      0.00000                 0.00000         0.00000  0.00000  0.00000   
MIR1302-2HG ENSG00000243485.5                      0.00000                 0.00000         0.00000  0.00000  0.00000   
FAM138A     ENSG00000237613.2                      0.00000                 0.00000         0.00000  0.00000  0.00000   

                                 Testis  Thyroid   Uterus   Vagina Whole Blood  
                                 Testis  Thyroid   Uterus   Vagina Whole Blood  
Description Name                                                                
DDX11L1     ENSG00000223972.5  0.166403  0.00000  0.00000  0.00000     0.00000  
WASH7P      ENSG00000227232.5  4.702530  6.27255  7.19001  5.74554     2.64743  
MIR6859-1   ENSG00000278267.1  0.000000  0.00000  0.00000  0.00000     0.00000  
MIR1302-2HG ENSG00000243485.5  0.054223  0.00000  0.00000  0.00000     0.00000  
FAM138A     ENSG00000237613.2  0.000000  0.00000  0.00000  0.00000     0.00000  
                    Amygdala  Anterior cingulate cortex (BA24)  Caudate (basal ganglia)  Cerebellar Hemisphere  \
Name                                                                                                             
ENSG00000141510.16   3.66265                           3.13225                  4.02247                2.02662   

                    Cerebellum   Cortex  Frontal Cortex (BA9)  Hippocampus  Hypothalamus  \
Name                                                                                       
ENSG00000141510.16     2.44665  3.65122               2.90659      2.71056       3.15486   

                    Nucleus accumbens (basal ganglia)  Putamen (basal ganglia)  Spinal cord (cervical c-1)  \
Name                                                                                                         
ENSG00000141510.16                            3.37169                  3.32102                     4.16092   

                    Substantia nigra  
Name                                  
ENSG00000141510.16           3.89352  
{% endraw %} {% raw %}
# Every Brain sub-region column; selecting on the top column level drops that
# level, leaving the sub-region names as the column labels.
gtm['Brain']
# Alternative: pick columns by their second-level label instead, e.g.
# gtm.loc[:, gtm.columns.get_level_values(1) == 'Cortex']
Amygdala Anterior cingulate cortex (BA24) Caudate (basal ganglia) Cerebellar Hemisphere Cerebellum Cortex Frontal Cortex (BA9) Hippocampus Hypothalamus Nucleus accumbens (basal ganglia) Putamen (basal ganglia) Spinal cord (cervical c-1) Substantia nigra
Description Name
DDX11L1 ENSG00000223972.5 0.00000 0.00000 0.000000 0.00000 0.00000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.00000 0.000000
WASH7P ENSG00000227232.5 1.43859 1.69285 1.566050 4.99231 5.72099 2.483170 2.146670 1.68599 1.748110 1.538990 1.441670 2.73049 1.741940
MIR6859-1 ENSG00000278267.1 0.00000 0.00000 0.000000 0.00000 0.00000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.00000 0.000000
MIR1302-2HG ENSG00000243485.5 0.00000 0.00000 0.024264 0.00000 0.00000 0.027366 0.030382 0.00000 0.024714 0.030669 0.023474 0.00000 0.019526
FAM138A ENSG00000237613.2 0.00000 0.00000 0.000000 0.00000 0.00000 0.000000 0.000000 0.00000 0.000000 0.000000 0.000000 0.00000 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
MT-ND6 ENSG00000198695.2 9631.77000 8319.14000 11516.600000 4967.36000 5779.40000 7361.480000 6904.070000 9409.32000 9606.640000 10239.100000 12166.000000 6903.48000 8547.190000
MT-TE ENSG00000210194.1 40.58860 31.35490 78.695300 15.12030 17.25060 25.281300 22.448400 33.90560 34.389900 43.712000 45.240400 19.09150 49.059800
MT-CYB ENSG00000198727.2 32931.80000 28346.70000 40960.000000 21992.00000 27120.50000 30873.600000 24999.900000 35185.30000 31077.800000 38008.100000 44653.400000 24606.80000 30844.500000
MT-TT ENSG00000210195.2 3.44476 3.47988 5.581470 1.83920 1.90798 2.987390 2.759550 3.86473 3.165480 3.943170 5.053390 2.83474 8.182420
MT-TP ENSG00000210196.2 5.67736 5.40242 7.071810 2.55710 2.77638 4.589150 4.768040 6.07535 5.551080 5.300060 7.360820 4.54090 11.164100

56200 rows × 13 columns

{% endraw %}

Full-size per-sample TPM table — this cell is slow to run.

{% raw %}
# Read the complete TPM table from gtex_tpm_fn. The read is bracketed by log
# messages so its duration shows up in the log timestamps.
logger.info("Starting g8 load")
_g8 = pandas.read_csv(
    gtex_tpm_fn,
    sep='\t',
    header=0,
    # Original note: the first 2 rows are comments about the file size.
    skiprows=2,
    # usecols=sample_list,  # optional: limit the number of samples read
)
logger.info("\tfinished g8 load")

# Discard the 'gene_id' column and index the rows by 'Description'; _g8 keeps
# the table exactly as it was read.
g8 = _g8.drop(columns='gene_id').set_index('Description')
# Relabel the columns in place using rename_cols (defined outside this cell).
g8.rename(columns=rename_cols, inplace=True)
[I 210115 18:49:40 <ipython-input-25-cb330cc29dbd>:1] Starting g8 load
{% endraw %} {% raw %}
# Preview the first five rows of the full table.
g8.head(n=5)
{% endraw %}